In [3]:
import pandas as pd

In [4]:
# Raw school-level Regents report, read straight from the course's GitHub dataset repo.
REGENTS_CSV_URL = 'https://raw.githubusercontent.com/CunyLaguardiaDataAnalytics/datasets/master/2014-15_To_2016-17_School-_Level_NYC_Regents_Report_For_All_Variables.csv'
df = pd.read_csv(REGENTS_CSV_URL)

In [71]:
# Dimensions of the freshly loaded frame, as (rows, columns).
df.shape

(212331, 15)

In [45]:
# Column dtypes as inferred on load. Columns listed as `object` were not
# parsed as numeric and need a closer look before any arithmetic.
df.dtypes

School DBN                     object
School Name                    object
School Level                   object
Regents Exam                   object
Year                            int64
Total Tested                    int64
Mean Score                     object
Number Scoring Below 65        object
Percent Scoring Below 65       object
Number Scoring 65 or Above     object
Percent Scoring 65 or Above    object
Number Scoring 80 or Above     object
Percent Scoring 80 or Above    object
Number Scoring CR              object
Percent Scoring CR             object
dtype: object

In [63]:
# The report is meant to cover school years 2014-15 through 2016-17.
# Print the smallest and largest 'Year' to check the column spans that range.

years = df['Year']
print(years.min(), years.max())

2015 2017


In [66]:
# Count missing entries per column.
# Only true NaN values are counted here; placeholder strings stored in a
# column are not treated as missing.

missing_counts = df.isna().sum()
missing_counts

School DBN                      0
School Name                     0
School Level                    0
Regents Exam                   10
Year                            0
Total Tested                    0
Mean Score                      0
Number Scoring Below 65         0
Percent Scoring Below 65        0
Number Scoring 65 or Above      0
Percent Scoring 65 or Above     0
Number Scoring 80 or Above      0
Percent Scoring 80 or Above     0
Number Scoring CR               0
Percent Scoring CR              0
dtype: int64

In [79]:
# Pull out the rows whose 'Regents Exam' value is missing so they can be inspected.
exam_missing = df['Regents Exam'].isna()
df.loc[exam_missing]

Unnamed: 0,School DBN,School Name,School Level,Regents Exam,Year,Total Tested,Mean Score,Number Scoring Below 65,Percent Scoring Below 65,Number Scoring 65 or Above,Percent Scoring 65 or Above,Number Scoring 80 or Above,Percent Scoring 80 or Above,Number Scoring CR,Percent Scoring CR
18654,84M478,Inwood Academy for Leadership Charter School,Secondary School,,2015,90,55.6,61,67.8,29,32.2,6,6.7,0,0
52756,84M478,Inwood Academy for Leadership Charter School,Secondary School,,2015,81,55.9,55,67.9,26,32.1,5,6.2,0,0
52757,84M478,Inwood Academy for Leadership Charter School,Secondary School,,2015,9,53.4,6,66.7,3,33.3,1,11.1,0,0
100865,84M478,Inwood Academy for Leadership Charter School,Secondary School,,2015,5,s,s,s,s,s,s,s,na,na
100866,84M478,Inwood Academy for Leadership Charter School,Secondary School,,2015,54,57.4,34,63,20,37,6,11.1,0,0
100867,84M478,Inwood Academy for Leadership Charter School,Secondary School,,2015,31,s,s,s,s,s,s,s,na,na
138300,84M478,Inwood Academy for Leadership Charter School,Secondary School,,2015,54,57.3,34,63,20,37,5,9.3,0,0
138301,84M478,Inwood Academy for Leadership Charter School,Secondary School,,2015,36,53.2,27,75,9,25,1,2.8,0,0
209785,84M478,Inwood Academy for Leadership Charter School,Secondary School,,2015,12,59,9,75,3,25,1,8.3,0,0
209786,84M478,Inwood Academy for Leadership Charter School,Secondary School,,2015,78,55.1,52,66.7,26,33.3,5,6.4,0,0


In [83]:
# Cleaning the data

# Ten rows have no 'Regents Exam' value, and all of them come from one school.
# Drop just those ten rows: only rows lacking an exam name are removed, so any
# other rows for that school stay in the dataset.
# `subset` restricts the drop to the column known to have gaps, and the result
# is reassigned rather than mutated with inplace=True.

df = df.dropna(subset=['Regents Exam'])

In [137]:
# 'Mean Score'

# We expect integers or floats, but the column was loaded as objects because
# some rows hold the placeholder 's' instead of a score.
# NOTE(review): 's' is presumably a suppressed-value marker -- confirm against
# the dataset's documentation before relying on that reading.

# Keep only the rows that carry a real mean score. The explicit .copy() makes
# the filtered result an independent frame, so the column assignment below
# does not raise SettingWithCopyWarning.
df = df[df['Mean Score'] != 's'].copy()

# With the placeholder gone, every remaining value parses as a number.
df['Mean Score'] = pd.to_numeric(df['Mean Score'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 137101 entries, 1 to 212325
Data columns (total 15 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   School DBN                   137101 non-null  object 
 1   School Name                  137101 non-null  object 
 2   School Level                 137101 non-null  object 
 3   Regents Exam                 137101 non-null  object 
 4   Year                         137101 non-null  int64  
 5   Total Tested                 137101 non-null  int64  
 6   Mean Score                   137101 non-null  float64
 7   Number Scoring Below 65      137101 non-null  object 
 8   Percent Scoring Below 65     137101 non-null  object 
 9   Number Scoring 65 or Above   137101 non-null  object 
 10  Percent Scoring 65 or Above  137101 non-null  object 
 11  Number Scoring 80 or Above   137101 non-null  object 
 12  Percent Scoring 80 or Above  137101 non-null  object 
 13 

In [142]:
# 'Percent Scoring 80 or Above'
# Another column of interest that was loaded as objects, so it also has to be
# recast before it can be used numerically. Confirm its current dtype first.

df.dtypes['Percent Scoring 80 or Above']

dtype('O')

In [146]:
# Convert the percentages to numbers.
# `df` was produced by a boolean filter in an earlier cell, so writing a column
# into it directly can raise SettingWithCopyWarning. Building a new frame with
# .assign sidesteps that and leaves `df` bound to the converted result.
pct_80_col = 'Percent Scoring 80 or Above'
df = df.assign(**{pct_80_col: pd.to_numeric(df[pct_80_col])})

# Confirm the conversion took effect.
df[pct_80_col].dtype

dtype('float64')

In [148]:
# Display the frame after cleaning; pandas abbreviates long frames, showing
# only the leading and trailing rows.
df

Unnamed: 0,School DBN,School Name,School Level,Regents Exam,Year,Total Tested,Mean Score,Number Scoring Below 65,Percent Scoring Below 65,Number Scoring 65 or Above,Percent Scoring 65 or Above,Number Scoring 80 or Above,Percent Scoring 80 or Above,Number Scoring CR,Percent Scoring CR
1,01M034,P.S. 034 Franklin D. Roosevelt,K-8,Living Environment,2015,16,77.9,1,6.3,15,93.8,7,43.8,na,na
2,01M034,P.S. 034 Franklin D. Roosevelt,K-8,Living Environment,2016,9,74.0,1,11.1,8,88.9,2,22.2,na,na
5,01M140,P.S. 140 Nathan Straus,K-8,Living Environment,2015,9,67.4,3,33.3,6,66.7,0,0.0,na,na
6,01M140,P.S. 140 Nathan Straus,K-8,Living Environment,2016,15,72.6,2,13.3,13,86.7,5,33.3,na,na
7,01M140,P.S. 140 Nathan Straus,K-8,Living Environment,2017,9,64.4,5,55.6,4,44.4,1,11.1,na,na
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212321,84X717,Icahn Charter School,K-8,Common Core Algebra,2016,6,87.2,0,0,6,100,6,100.0,6,100
212322,84X717,Icahn Charter School,K-8,Common Core Algebra,2017,6,87.0,0,0,6,100,6,100.0,6,100
212323,84X717,Icahn Charter School,K-8,Common Core Algebra,2017,6,88.3,0,0,6,100,5,83.3,6,100
212324,84X717,Icahn Charter School,K-8,Living Environment,2015,8,76.5,1,12.5,7,87.5,2,25.0,na,na
