DATA201_projects/python_project_2.ipynb

569 lines
50 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "a8d466b1",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Show every column when a DataFrame is displayed instead of eliding the middle ones.\n",
"pd.options.display.max_columns = None"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "1feb2733",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_6745/2268714868.py:1: DtypeWarning: Columns (18,20) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" df = pd.read_csv('~/Downloads/NYPD_Complaint_Data_Historic.csv')\n"
]
}
],
"source": [
"# Load the NYPD historic complaint data.\n",
"# low_memory=False makes pandas infer each column's dtype from the whole file\n",
"# rather than chunk by chunk, which avoids the mixed-type DtypeWarning.\n",
"df = pd.read_csv('~/Downloads/NYPD_Complaint_Data_Historic.csv', low_memory=False)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "5b1cdbba",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['CMPLNT_NUM', 'CMPLNT_FR_DT', 'CMPLNT_FR_TM', 'CMPLNT_TO_DT',\n",
" 'CMPLNT_TO_TM', 'ADDR_PCT_CD', 'RPT_DT', 'KY_CD', 'OFNS_DESC', 'PD_CD',\n",
" 'PD_DESC', 'CRM_ATPT_CPTD_CD', 'LAW_CAT_CD', 'BORO_NM',\n",
" 'LOC_OF_OCCUR_DESC', 'PREM_TYP_DESC', 'JURIS_DESC', 'JURISDICTION_CODE',\n",
" 'PARKS_NM', 'HADEVELOPT', 'HOUSING_PSA', 'X_COORD_CD', 'Y_COORD_CD',\n",
" 'SUSP_AGE_GROUP', 'SUSP_RACE', 'SUSP_SEX', 'TRANSIT_DISTRICT',\n",
" 'Latitude', 'Longitude', 'Lat_Lon', 'PATROL_BORO', 'STATION_NAME',\n",
" 'VIC_AGE_GROUP', 'VIC_RACE', 'VIC_SEX'],\n",
" dtype='object')"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# List the column names to see which fields the dataset provides.\n",
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "1ac30b35",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The data dictionary describes CMPLNT_NUM (Complaint Number) as a randomly\n",
"# generated, persistent ID. Check whether every row carries a distinct value.\n",
"\n",
"df['CMPLNT_NUM'].is_unique"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "f0c76e18",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False 7821537\n",
"True 3962\n",
"dtype: int64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# It is not unique, which is unexpected.\n",
"# Count rows whose complaint number repeats an earlier row (True)\n",
"# versus rows where the number appears for the first time (False).\n",
"\n",
"df.duplicated(subset=['CMPLNT_NUM']).value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "253ab2f0",
"metadata": {},
"outputs": [],
"source": [
"# Since CMPLNT_NUM is not unique, we can't use it as an index, so drop it.\n",
"# errors='ignore' keeps this cell safe to re-run once the column is gone\n",
"# (the in-place drop raised KeyError on a second execution).\n",
"\n",
"df = df.drop(columns='CMPLNT_NUM', errors='ignore')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "7859f04c",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>CMPLNT_FR_DT</th>\n",
" <th>CMPLNT_FR_TM</th>\n",
" <th>CMPLNT_TO_DT</th>\n",
" <th>CMPLNT_TO_TM</th>\n",
" <th>ADDR_PCT_CD</th>\n",
" <th>RPT_DT</th>\n",
" <th>KY_CD</th>\n",
" <th>OFNS_DESC</th>\n",
" <th>PD_CD</th>\n",
" <th>PD_DESC</th>\n",
" <th>CRM_ATPT_CPTD_CD</th>\n",
" <th>LAW_CAT_CD</th>\n",
" <th>BORO_NM</th>\n",
" <th>LOC_OF_OCCUR_DESC</th>\n",
" <th>PREM_TYP_DESC</th>\n",
" <th>JURIS_DESC</th>\n",
" <th>JURISDICTION_CODE</th>\n",
" <th>PARKS_NM</th>\n",
" <th>HADEVELOPT</th>\n",
" <th>HOUSING_PSA</th>\n",
" <th>X_COORD_CD</th>\n",
" <th>Y_COORD_CD</th>\n",
" <th>SUSP_AGE_GROUP</th>\n",
" <th>SUSP_RACE</th>\n",
" <th>SUSP_SEX</th>\n",
" <th>TRANSIT_DISTRICT</th>\n",
" <th>Latitude</th>\n",
" <th>Longitude</th>\n",
" <th>Lat_Lon</th>\n",
" <th>PATROL_BORO</th>\n",
" <th>STATION_NAME</th>\n",
" <th>VIC_AGE_GROUP</th>\n",
" <th>VIC_RACE</th>\n",
" <th>VIC_SEX</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>12/31/2019</td>\n",
" <td>17:30:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>32.0</td>\n",
" <td>12/31/2019</td>\n",
" <td>118</td>\n",
" <td>DANGEROUS WEAPONS</td>\n",
" <td>793.0</td>\n",
" <td>WEAPONS POSSESSION 3</td>\n",
" <td>COMPLETED</td>\n",
" <td>FELONY</td>\n",
" <td>MANHATTAN</td>\n",
" <td>NaN</td>\n",
" <td>STREET</td>\n",
" <td>N.Y. POLICE DEPT</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>999937.0</td>\n",
" <td>238365.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>40.820927</td>\n",
" <td>-73.943324</td>\n",
" <td>(40.82092679700002, -73.94332421899996)</td>\n",
" <td>PATROL BORO MAN NORTH</td>\n",
" <td>NaN</td>\n",
" <td>UNKNOWN</td>\n",
" <td>UNKNOWN</td>\n",
" <td>E</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>12/29/2019</td>\n",
" <td>16:31:00</td>\n",
" <td>12/29/2019</td>\n",
" <td>16:54:00</td>\n",
" <td>47.0</td>\n",
" <td>12/29/2019</td>\n",
" <td>113</td>\n",
" <td>FORGERY</td>\n",
" <td>729.0</td>\n",
" <td>FORGERY,ETC.,UNCLASSIFIED-FELO</td>\n",
" <td>COMPLETED</td>\n",
" <td>FELONY</td>\n",
" <td>BRONX</td>\n",
" <td>NaN</td>\n",
" <td>STREET</td>\n",
" <td>N.Y. POLICE DEPT</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1022508.0</td>\n",
" <td>261990.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>40.885701</td>\n",
" <td>-73.861640</td>\n",
" <td>(40.885701406000074, -73.86164032499995)</td>\n",
" <td>PATROL BORO BRONX</td>\n",
" <td>NaN</td>\n",
" <td>UNKNOWN</td>\n",
" <td>UNKNOWN</td>\n",
" <td>E</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>12/15/2019</td>\n",
" <td>18:45:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>109.0</td>\n",
" <td>12/29/2019</td>\n",
" <td>578</td>\n",
" <td>HARRASSMENT 2</td>\n",
" <td>638.0</td>\n",
" <td>HARASSMENT,SUBD 3,4,5</td>\n",
" <td>COMPLETED</td>\n",
" <td>VIOLATION</td>\n",
" <td>QUEENS</td>\n",
" <td>FRONT OF</td>\n",
" <td>STREET</td>\n",
" <td>N.Y. POLICE DEPT</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1034178.0</td>\n",
" <td>209758.0</td>\n",
" <td>25-44</td>\n",
" <td>UNKNOWN</td>\n",
" <td>M</td>\n",
" <td>NaN</td>\n",
" <td>40.742281</td>\n",
" <td>-73.819824</td>\n",
" <td>(40.74228115600005, -73.81982408)</td>\n",
" <td>PATROL BORO QUEENS NORTH</td>\n",
" <td>NaN</td>\n",
" <td>25-44</td>\n",
" <td>WHITE HISPANIC</td>\n",
" <td>F</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" CMPLNT_FR_DT CMPLNT_FR_TM CMPLNT_TO_DT CMPLNT_TO_TM ADDR_PCT_CD \\\n",
"0 12/31/2019 17:30:00 NaN NaN 32.0 \n",
"1 12/29/2019 16:31:00 12/29/2019 16:54:00 47.0 \n",
"2 12/15/2019 18:45:00 NaN NaN 109.0 \n",
"\n",
" RPT_DT KY_CD OFNS_DESC PD_CD \\\n",
"0 12/31/2019 118 DANGEROUS WEAPONS 793.0 \n",
"1 12/29/2019 113 FORGERY 729.0 \n",
"2 12/29/2019 578 HARRASSMENT 2 638.0 \n",
"\n",
" PD_DESC CRM_ATPT_CPTD_CD LAW_CAT_CD BORO_NM \\\n",
"0 WEAPONS POSSESSION 3 COMPLETED FELONY MANHATTAN \n",
"1 FORGERY,ETC.,UNCLASSIFIED-FELO COMPLETED FELONY BRONX \n",
"2 HARASSMENT,SUBD 3,4,5 COMPLETED VIOLATION QUEENS \n",
"\n",
" LOC_OF_OCCUR_DESC PREM_TYP_DESC JURIS_DESC JURISDICTION_CODE \\\n",
"0 NaN STREET N.Y. POLICE DEPT 0.0 \n",
"1 NaN STREET N.Y. POLICE DEPT 0.0 \n",
"2 FRONT OF STREET N.Y. POLICE DEPT 0.0 \n",
"\n",
" PARKS_NM HADEVELOPT HOUSING_PSA X_COORD_CD Y_COORD_CD SUSP_AGE_GROUP \\\n",
"0 NaN NaN NaN 999937.0 238365.0 NaN \n",
"1 NaN NaN NaN 1022508.0 261990.0 NaN \n",
"2 NaN NaN NaN 1034178.0 209758.0 25-44 \n",
"\n",
" SUSP_RACE SUSP_SEX TRANSIT_DISTRICT Latitude Longitude \\\n",
"0 NaN NaN NaN 40.820927 -73.943324 \n",
"1 NaN NaN NaN 40.885701 -73.861640 \n",
"2 UNKNOWN M NaN 40.742281 -73.819824 \n",
"\n",
" Lat_Lon PATROL_BORO \\\n",
"0 (40.82092679700002, -73.94332421899996) PATROL BORO MAN NORTH \n",
"1 (40.885701406000074, -73.86164032499995) PATROL BORO BRONX \n",
"2 (40.74228115600005, -73.81982408) PATROL BORO QUEENS NORTH \n",
"\n",
" STATION_NAME VIC_AGE_GROUP VIC_RACE VIC_SEX \n",
"0 NaN UNKNOWN UNKNOWN E \n",
"1 NaN UNKNOWN UNKNOWN E \n",
"2 NaN 25-44 WHITE HISPANIC F "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first three rows.\n",
"df.head(n=3)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "5fd666ad",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"BORO CRIME_CLASS\n",
"BRONX MISDEMEANOR 1000078\n",
" FELONY 466248\n",
" VIOLATION 227655\n",
"BROOKLYN MISDEMEANOR 1249836\n",
" FELONY 754414\n",
" VIOLATION 308893\n",
"MANHATTAN MISDEMEANOR 1075687\n",
" FELONY 597184\n",
" VIOLATION 209421\n",
"QUEENS MISDEMEANOR 826883\n",
" FELONY 516528\n",
" VIOLATION 218301\n",
"STATEN ISLAND MISDEMEANOR 210270\n",
" FELONY 81032\n",
" VIOLATION 70589\n",
"Name: CRIME_CLASS, dtype: int64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Question 1\n",
"\n",
"# How do the boroughs compare by the class of crime reported?\n",
"# Rename the two columns involved first so the table is easier to read.\n",
"\n",
"question_1_names = {\n",
"    'LAW_CAT_CD': 'CRIME_CLASS',\n",
"    'BORO_NM': 'BORO',\n",
"}\n",
"df.rename(columns=question_1_names, inplace=True)\n",
"\n",
"# Number of complaints per crime class within each borough.\n",
"df.groupby('BORO')['CRIME_CLASS'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "3cbcbd7c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot:xlabel='BORO,CRIME_CLASS'>"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 720x360 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# A quick visualization of the above\n",
"\n",
"ax = df.groupby(['BORO'])['CRIME_CLASS'].value_counts().plot(kind='bar', figsize=(10, 5))\n",
"\n",
"# Label the chart so it can be read on its own. The trailing semicolon\n",
"# suppresses the text repr that would otherwise be echoed under the cell.\n",
"ax.set(title='Complaints by borough and crime class', ylabel='Number of complaints');"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "b07d2228",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6761"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Compare the number of missing values in the two offense-description columns\n",
"# side by side, instead of toggling commented-out lines to check each one.\n",
"df[['OFNS_DESC', 'PD_DESC']].isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "1ef4d375",
"metadata": {},
"outputs": [],
"source": [
"# Question 2\n",
"\n",
"# Some incidents took place in NYC parks, playgrounds or greenspaces.\n",
"# Which crimes were reported most often there, and in which spaces?\n",
"\n",
"# As before, start by giving the relevant columns friendlier names.\n",
"\n",
"question_2_names = {\n",
"    'PARKS_NM': 'PUBLIC_SPACE',\n",
"    'PD_DESC': 'DESCRIPTION',\n",
"    'ADDR_PCT_CD': 'PRECINCT',\n",
"    'Lat_Lon': 'LOCATION',\n",
"    'CMPLNT_FR_DT': 'DATE',\n",
"    'CMPLNT_FR_TM': 'TIME',\n",
"}\n",
"df.rename(columns=question_2_names, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "b838819a",
"metadata": {},
"outputs": [],
"source": [
"# PD_DESC and OFNS_DESC both describe the incident.\n",
"# According to the data dictionary, PD_DESC is the more granular of the two.\n",
"# It also has fewer NaNs. It is the column renamed to DESCRIPTION above."
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "e9c16848",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"CENTRAL PARK 1856\n",
"FLUSHING MEADOWS CORONA PARK 1532\n",
"CONEY ISLAND BEACH & BOARDWALK 1161\n",
"WASHINGTON SQUARE PARK 1063\n",
"RIVERSIDE PARK 680\n",
"PROSPECT PARK 616\n",
"UNION SQUARE PARK 599\n",
"MARCUS GARVEY PARK 469\n",
"RANDALL'S ISLAND PARK 454\n",
"SARA D. ROOSEVELT PARK 395\n",
"BRYANT PARK 354\n",
"ST. MARY'S PARK BRONX 354\n",
"CLAREMONT PARK 348\n",
"MACOMBS DAM PARK 341\n",
"CROTONA PARK 319\n",
"Name: PUBLIC_SPACE, dtype: int64"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Let's choose public spaces to compare.\n",
"\n",
"# value_counts() already orders the result by frequency (and skips NaN), so\n",
"# sorting the full column beforehand only added work. Entries with equal\n",
"# counts may be listed in a different relative order than before.\n",
"df['PUBLIC_SPACE'].value_counts().head(15)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "91ae572a",
"metadata": {},
"outputs": [],
"source": [
"# Each of the top four has more than 1,000 incidents.\n",
"# We'll pick Central Park and Coney Island Beach & Boardwalk."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}