# Working title: working subtitle

## initial remarks

In [1]:
import pandas as pd
import seaborn as sns

In [2]:
# Load the Local Law 33 compliance export from the user's home directory.
csv_path = '~/python_class/DOB_Sustainability_Compliance_Map__Local_Law_33.csv'
df = pd.read_csv(csv_path)

## Part 1: Data Exploration

In [3]:
df.shape

(21681, 11)

In [4]:
df.head(3)

Unnamed: 0,Block,Lot,Building_Class,Tax_Class,Building_Count,DOF_Gross_Square_Footage,Address,BoroughName,BBL,ENERGY STAR Score,LetterScore
0,1,10,Y4,0,124,2598091,920 GRESHAM ROAD,MANHATTAN,1000010010,1,D
1,2,23,T2,0,1,24346,20 SOUTH STREET,MANHATTAN,1000020023,0,F
2,4,7501,R0,2,1,2542563,1 WATER STREET,MANHATTAN,1000047501,61,C


In [5]:
df.columns

Index(['Block', 'Lot', 'Building_Class', 'Tax_Class', 'Building_Count',
       'DOF_Gross_Square_Footage', 'Address', 'BoroughName', 'BBL',
       'ENERGY STAR Score', 'LetterScore'],
      dtype='object')

In [6]:
# Columns seem to be self-explanatory, except BBL. According to NYC OpenData:
# "Borough Block and Lot identifier as assigned by NYC Department of Finance".

In [7]:
# Is this dataset citywide? or just Manhattan?

df['BoroughName'].value_counts()

MANHATTAN        7858
BROOKLYN         5469
BRONX            4349
QUEENS           3659
STATEN ISLAND     346
Name: BoroughName, dtype: int64

In [8]:
# Missing data?

df.isna().sum()

Block                       0
Lot                         0
Building_Class              2
Tax_Class                   0
Building_Count              0
DOF_Gross_Square_Footage    0
Address                     7
BoroughName                 0
BBL                         0
ENERGY STAR Score           0
LetterScore                 0
dtype: int64

In [9]:
df[df['Building_Class'].isna()]

Unnamed: 0,Block,Lot,Building_Class,Tax_Class,Building_Count,DOF_Gross_Square_Footage,Address,BoroughName,BBL,ENERGY STAR Score,LetterScore
4254,1595,7501,,0,0,0,1330 5 AVENUE,MANHATTAN,1015950031,64,C
8124,3016,7502,,0,0,0,1926 LONGFELLOW AVENUE,BRONX,2030160038,100,A


In [10]:
df[df['Address'].isna()]

Unnamed: 0,Block,Lot,Building_Class,Tax_Class,Building_Count,DOF_Gross_Square_Footage,Address,BoroughName,BBL,ENERGY STAR Score,LetterScore
1228,506,12,W3,0,1,49475,,MANHATTAN,1005060012,10,D
7145,1734,1,I1,0,5,1017118,,MANHATTAN,1017340001,7,D
9225,2758,6,N9,0,1,17200,,BRONX,2027580006,89,A
9226,2758,36,N9,0,1,37060,,BRONX,2027580036,66,C
13711,1769,72,C1,0,1,30720,,BROOKLYN,-2147483648,0,F
15056,1602,13,C1,0,1,14720,,BROOKLYN,-2147483648,0,F
16381,3755,22,C1,0,1,25564,,BROOKLYN,-2147483648,0,F


In [11]:
# Missing Address is not a big deal because the rest of the values are complete.
# But missing Building Class could be significant.

# The two offending rows also have Building Count = 0.
# How is that possible, since they have Energy Star scores?

# In the next section we may decide to drop those two rows.

## Part 2: Data Cleaning

In [12]:
# Which columns are informative?

df.dtypes

Block                        int64
Lot                          int64
Building_Class              object
Tax_Class                    int64
Building_Count               int64
DOF_Gross_Square_Footage     int64
Address                     object
BoroughName                 object
BBL                          int64
ENERGY STAR Score            int64
LetterScore                 object
dtype: object

In [13]:
# Interesting for analysis:

# DOF_Gross_Square_Footage
# ENERGY STAR Score
# LetterScore

# Other columns are less interesting:

# Building_Count is the number of buildings in one Block.
# A Block can have more than one Lot, but a Lot only has one Block.
# Block, Lot and BBL are identifiers assigned by the city.

# A good visual reference is the Digital Tax Map put out by the NYC Department of Finance:
# http://gis.nyc.gov/taxmap/map.htm

In [14]:
# Can any identifiers be used as an index?

df['Block'].is_unique

False

In [15]:
df['Lot'].is_unique

False

In [16]:
df['BBL'].is_unique

False

In [17]:
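# Since their values are not unique, none of them can serve as a unique index.
# Note: in the outputs of this notebook, BBL is -2147483648 (the minimum 32-bit
# integer) for several Brooklyn and Queens rows. That looks like an integer overflow
# in the source file, so BBL is not a reliable identifier for those rows.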

In [18]:
# Shall we rename or discard any columns from this dataset?

# BBL could be eliminated. However, there are only 11 columns total, and since df.head() is easily readable on my monitor without scrolling horizontally (as you're doing now), I see no harm in keeping it.

In [19]:
# Rename columns containing whitespace or camelcase.
# The result is assigned back to df rather than using inplace=True, so the
# cell does not mutate a frame that earlier cells have already displayed.

df = df.rename(columns={
    "BoroughName": "Borough_Name",
    "ENERGY STAR Score": "Energy_Star_Score",
    "LetterScore": "Letter_Score",
})

In [20]:
df.head(1)

Unnamed: 0,Block,Lot,Building_Class,Tax_Class,Building_Count,DOF_Gross_Square_Footage,Address,Borough_Name,BBL,Energy_Star_Score,Letter_Score
0,1,10,Y4,0,124,2598091,920 GRESHAM ROAD,MANHATTAN,1000010010,1,D


In [21]:
# Unforeseen consequence of renaming: now I have to scroll horizontally.

In [22]:
# Rename columns once more, this time shortening the long names so the
# frame fits on screen without horizontal scrolling.
# The result is assigned back to df rather than using inplace=True.

df = df.rename(columns={
    "DOF_Gross_Square_Footage": "Sq_Footage",
    "Energy_Star_Score": "Energy_Score",
    "Borough_Name": "Borough",
    "Building_Class": "Bldg_Class",
    "Building_Count": "Bldg_Count",
})

In [23]:
df.head(1)

Unnamed: 0,Block,Lot,Bldg_Class,Tax_Class,Bldg_Count,Sq_Footage,Address,Borough,BBL,Energy_Score,Letter_Score
0,1,10,Y4,0,124,2598091,920 GRESHAM ROAD,MANHATTAN,1000010010,1,D


In [24]:
# Repeat the search for missing data

df.isna().sum()

Block           0
Lot             0
Bldg_Class      2
Tax_Class       0
Bldg_Count      0
Sq_Footage      0
Address         7
Borough         0
BBL             0
Energy_Score    0
Letter_Score    0
dtype: int64

In [25]:
# The 7 missing addresses are left alone; only the 2 rows without a
# Building Class are removed, because Bldg_Class will be used as a key
# in df.groupby().

required_columns = ['Bldg_Class']
df.dropna(axis='index', how='any', subset=required_columns, inplace=True)

In [26]:
df.isna().sum()

Block           0
Lot             0
Bldg_Class      0
Tax_Class       0
Bldg_Count      0
Sq_Footage      0
Address         7
Borough         0
BBL             0
Energy_Score    0
Letter_Score    0
dtype: int64

In [28]:
# Search for unexpected data

# df['Energy_Score'].min() # looks good
# df['Energy_Score'].max() # looks good
# df['Sq_Footage'].max() # looks good
df['Sq_Footage'].min()

0

In [29]:
# A building cannot have zero square feet of floorspace.
# What's going on?

df[df['Sq_Footage'] == 0].groupby(['Letter_Score']).count()

Unnamed: 0_level_0,Block,Lot,Bldg_Class,Tax_Class,Bldg_Count,Sq_Footage,Address,Borough,BBL,Energy_Score
Letter_Score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A,3,3,3,3,3,3,3,3,3,3
B,1,1,1,1,1,1,1,1,1,1
C,5,5,5,5,5,5,5,5,5,5
D,6,6,6,6,6,6,6,6,6,6
F,14,14,14,14,14,14,14,14,14,14


In [30]:
# The ones with F can be explained:
# An F grade means that the building owner "didn't submit required benchmarking information",
# according to Local Law 95 of 2019. So it's not that the building has no square footage,
# but that the data was not submitted. Thus the failing grade.

# We'll leave 0 square feet with F grade untouched.

# For more information, see https://www1.nyc.gov/site/buildings/codes/benchmarking.page

In [35]:
# What to do with the others?

df[(df['Sq_Footage'] == 0) & (df['Letter_Score'] != 'F')].groupby(['Letter_Score']).count()

Unnamed: 0_level_0,Block,Lot,Bldg_Class,Tax_Class,Bldg_Count,Sq_Footage,Address,Borough,BBL,Energy_Score
Letter_Score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A,3,3,3,3,3,3,3,3,3,3
B,1,1,1,1,1,1,1,1,1,1
C,5,5,5,5,5,5,5,5,5,5
D,6,6,6,6,6,6,6,6,6,6


In [53]:
# 15 rows remain with 0 square feet of floorspace.
# Can we impute values from the mean square footage for each grade?

# (There must be an elegant way to do this. What you see below is not.)

In [67]:
# First, get the average square footage per letter grade, using only rows
# that are not graded F and that report a non-zero square footage.

subset0 = df.loc[:, ['Letter_Score', 'Sq_Footage']]
is_graded = subset0['Letter_Score'].ne('F')
has_area = subset0['Sq_Footage'].ne(0)
subset1 = subset0[is_graded & has_area]
subset1.groupby('Letter_Score').mean()

Unnamed: 0_level_0,Sq_Footage
Letter_Score,Unnamed: 1_level_1
A,111197.291071
B,133270.963702
C,128833.575964
D,108170.778312


In [77]:
# Assign variables, rounding to whole numbers (Sq_Footage is an integer column).
# The values are derived from subset1 (the non-F, non-zero rows selected above)
# instead of being copied by hand from the printed table, so they stay correct
# if the underlying data changes.

grade_means = subset1.groupby('Letter_Score')['Sq_Footage'].mean().round()
mean_A = int(grade_means['A'])
mean_B = int(grade_means['B'])
mean_C = int(grade_means['C'])
mean_D = int(grade_means['D'])

In [79]:
# Replace 0 values with the mean for the row's letter grade
# (mean_A for A, mean_B for B, and so on). F grades are not touched.

fill_by_grade = {'A': mean_A, 'B': mean_B, 'C': mean_C, 'D': mean_D}
for grade, fill_value in fill_by_grade.items():
    needs_fill = (df['Letter_Score'] == grade) & (df['Sq_Footage'] == 0)
    df.loc[needs_fill, 'Sq_Footage'] = fill_value

In [81]:
# Now the only 0 values should be for F grades

df[df['Sq_Footage'] == 0].groupby(['Letter_Score']).count()

Unnamed: 0_level_0,Block,Lot,Bldg_Class,Tax_Class,Bldg_Count,Sq_Footage,Address,Borough,BBL,Energy_Score
Letter_Score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
F,14,14,14,14,14,14,14,14,14,14


In [82]:
# TODO: delete this cell. It is a leftover check that the imputation above left
# the column dtypes unchanged (Sq_Footage is still int64).

df.dtypes

Block            int64
Lot              int64
Bldg_Class      object
Tax_Class        int64
Bldg_Count       int64
Sq_Footage       int64
Address         object
Borough         object
BBL              int64
Energy_Score     int64
Letter_Score    object
dtype: object

In [92]:
# Unexpected values, continued

df[df['Bldg_Count'] == 0]

Unnamed: 0,Block,Lot,Bldg_Class,Tax_Class,Bldg_Count,Sq_Footage,Address,Borough,BBL,Energy_Score,Letter_Score
11319,149,7502,U7,0,0,0,138 WILLOUGHBY STREET,BROOKLYN,-2147483648,0,F
11611,165,7504,U7,0,0,133271,35 HOYT STREET,BROOKLYN,-2147483648,75,B
13351,5804,2,U6,0,0,0,COLONIAL ROAD,BROOKLYN,-2147483648,0,F
14570,5322,4,V1,0,0,111197,23 OCEAN PARKWAY,BROOKLYN,-2147483648,100,A
14668,5799,59,D9,0,0,38315,3641 JOHNSON AVENUE,BRONX,2057990059,0,F
15726,4282,100,V1,0,0,0,25-70 REAR WHITESTONE EXPRESSWAY SR WEST,QUEENS,-2147483648,0,F


In [93]:
# How can a block have zero buildings?
# Again, we'll leave the F grades as is.

df[(df['Bldg_Count'] == 0) & (df['Letter_Score'] != 'F')]

Unnamed: 0,Block,Lot,Bldg_Class,Tax_Class,Bldg_Count,Sq_Footage,Address,Borough,BBL,Energy_Score,Letter_Score
11611,165,7504,U7,0,0,133271,35 HOYT STREET,BROOKLYN,-2147483648,75,B
14570,5322,4,V1,0,0,111197,23 OCEAN PARKWAY,BROOKLYN,-2147483648,100,A


In [94]:
# Have a peek at the Department of Finance Tax Map: http://gis.nyc.gov/taxmap/map.htm

# Looks like Bldg_Count = 1 for both. However, I'm not comfortable with imputing data
# by eyeballing it.

In [95]:
# Let's just drop them: the rows that report zero buildings but do not have
# an F grade (the two rows displayed earlier in this section).
# They are selected by condition rather than by hard-coded index labels, so
# the cell stays correct if the data changes and can be re-run without a
# KeyError once the rows are gone.

zero_bldg_graded = (df['Bldg_Count'] == 0) & (df['Letter_Score'] != 'F')
df = df.drop(index=df.index[zero_bldg_graded])