Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
1124 views
ubuntu2004
Kernel: Python 3 (system-wide)
#Using the attached csv we are going to practice some default dataframe methods and functions as a refresher, and then we'll take a look at the actual data set and clustering algorithm.
#Useful video: https://www.youtube.com/watch?v=F6kmIpWWEdU&list=PLeo1K3hjS3uuASpe-1LjfG5f14Bnozjwy&index=2
#https://www.geeksforgeeks.org/create-a-new-column-in-pandas-dataframe-based-on-the-existing-columns/
#Using K Means Clustering, determine any insights from the 2012 arrest data.
import pandas as pd #allows us simple data access features
import numpy as np #allows us to use numpy specific attributes
import matplotlib.pyplot as plt #allows us to create plotting and visualizations for our program
import sklearn #imports the machine learning algorithm we desire
from sklearn.cluster import KMeans #imports the KMeans clustering algorithm specifically
# Load the 2012 NYPD arrest data from a local CSV file into a pandas DataFrame.
# A DataFrame is a table of rows and named columns, much like a spreadsheet that lives in your program.
nypd_data = pd.read_csv("nypd_arrest_2012.csv") #relative path, so the CSV is looked up in the current working directory
nypd_data.head() #shows the first 5 rows of the DataFrame
nypd_data.tail() #shows the last 5 rows of the DataFrame
nypd_data.shape #(rows, columns) tuple; (8296, 19) for this file
(8296, 19)
nypd_data.dtypes #Data type of each column; text columns such as PERP_SEX are reported as "object"
ARREST_KEY int64 ARREST_DATE object PD_CD float64 PD_DESC object KY_CD float64 OFNS_DESC object LAW_CODE object LAW_CAT_CD object ARREST_BORO object ARREST_PRECINCT int64 JURISDICTION_CODE int64 AGE_GROUP object PERP_SEX object PERP_RACE object X_COORD_CD int64 Y_COORD_CD int64 Latitude float64 Longitude float64 New Georeferenced Column object dtype: object
nypd_data.describe() #summary statistics for the numeric columns: count, mean, standard deviation, min, quartiles (the 50% row is the median) and max; the mode is not included
# List the column labels of the DataFrame.
nypd_data.columns
Index(['ARREST_KEY', 'ARREST_DATE', 'PD_CD', 'PD_DESC', 'KY_CD', 'OFNS_DESC', 'LAW_CODE', 'LAW_CAT_CD', 'ARREST_BORO', 'ARREST_PRECINCT', 'JURISDICTION_CODE', 'AGE_GROUP', 'PERP_SEX', 'PERP_RACE', 'X_COORD_CD', 'Y_COORD_CD', 'Latitude', 'Longitude', 'New Georeferenced Column'], dtype='object')
nypd_data[0:2] #Slicing the DataFrame by position returns its first two rows
nypd_data["OFNS_DESC"][0:5] #Select a single column by name, then slice it to take its first five entries
0 DANGEROUS DRUGS 1 DANGEROUS WEAPONS 2 ASSAULT 3 & RELATED OFFENSES 3 FELONY ASSAULT 4 FELONY ASSAULT Name: OFNS_DESC, dtype: object
#Grab the first 10 entries of the AGE_GROUP column
nypd_data["AGE_GROUP"][0:10]
0 25-44 1 45-64 2 25-44 3 45-64 4 18-24 5 <18 6 25-44 7 18-24 8 25-44 9 25-44 Name: AGE_GROUP, dtype: object
nypd_data[["AGE_GROUP", "PERP_SEX"]][0:10] #Passing a list of column names selects several columns at once; the result can be sliced the same way.
# Re-list the column labels as a reference for the slicing exercises below.
nypd_data.columns
Index(['ARREST_KEY', 'ARREST_DATE', 'PD_CD', 'PD_DESC', 'KY_CD', 'OFNS_DESC', 'LAW_CODE', 'LAW_CAT_CD', 'ARREST_BORO', 'ARREST_PRECINCT', 'JURISDICTION_CODE', 'AGE_GROUP', 'PERP_SEX', 'PERP_RACE', 'X_COORD_CD', 'Y_COORD_CD', 'Latitude', 'Longitude', 'New Georeferenced Column'], dtype='object')
# Slice the first ten rows of arrest precinct and perp race
nypd_data[["ARREST_PRECINCT", "PERP_RACE"]][0:10]
# Slice the first ten rows of arrest precinct and arrest boro
nypd_data[["ARREST_PRECINCT","ARREST_BORO"]][0:10]
# Slice the first ten rows of arrest precinct, arrest boro and perp race
nypd_data[["ARREST_PRECINCT","ARREST_BORO", "PERP_RACE"]][0:10]
# Slice the first ten rows of arrest precinct, arrest boro and perp sex
nypd_data[["ARREST_PRECINCT","ARREST_BORO", "PERP_SEX"]][0:10]
# Slice the LAST ten rows of arrest precinct and perp sex.
# A negative start counts back from the end, so this is always exactly ten rows;
# the previous hard-coded start of 8285 returned eleven rows (8285..8295) for this 8296-row file.
nypd_data[["ARREST_PRECINCT","PERP_SEX"]][-10:]
# The .values attribute returns the column's entries as a NumPy array:
# one element per row, not a list of the distinct values.
nypd_data["PERP_SEX"].values
array(['F', 'M', 'M', ..., 'M', 'M', 'M'], dtype=object)
# Count how many rows hold each distinct PERP_SEX value.
nypd_data["PERP_SEX"].value_counts()
M 6998 F 1298 Name: PERP_SEX, dtype: int64
#Use value counts on the following individual categories: OFNS_DESC, ARREST_BORO & PERP_RACE
nypd_data["OFNS_DESC"].value_counts()
ASSAULT 3 & RELATED OFFENSES 1516 FELONY ASSAULT 804 PETIT LARCENY 786 MISCELLANEOUS PENAL LAW 609 CRIMINAL MISCHIEF & RELATED OF 558 BURGLARY 520 DANGEROUS DRUGS 519 ROBBERY 492 DANGEROUS WEAPONS 427 OFFENSES AGAINST PUBLIC ADMINI 314 GRAND LARCENY 311 SEX CRIMES 161 POSSESSION OF STOLEN PROPERTY 133 VEHICLE AND TRAFFIC LAWS 129 OFF. AGNST PUB ORD SENSBLTY & 123 CRIMINAL TRESPASS 123 GRAND LARCENY OF MOTOR VEHICLE 82 MURDER & NON-NEGL. MANSLAUGHTE 71 INTOXICATED & IMPAIRED DRIVING 70 FORGERY 66 BURGLAR'S TOOLS 66 OFFENSES AGAINST THE PERSON 53 UNAUTHORIZED USE OF A VEHICLE 45 FOR OTHER AUTHORITIES 43 RAPE 37 OFFENSES INVOLVING FRAUD 33 OTHER STATE LAWS (NON PENAL LA 32 OTHER OFFENSES RELATED TO THEF 27 FRAUDS 25 GAMBLING 22 NYS LAWS-UNCLASSIFIED FELONY 22 OTHER TRAFFIC INFRACTION 19 INTOXICATED/IMPAIRED DRIVING 10 OTHER STATE LAWS 7 ARSON 6 THEFT-FRAUD 3 ADMINISTRATIVE CODE 3 ALCOHOLIC BEVERAGE CONTROL LAW 3 KIDNAPPING & RELATED OFFENSES 2 OFFENSES RELATED TO CHILDREN 2 ENDAN WELFARE INCOMP 2 HARRASSMENT 2 2 OFFENSES AGAINST PUBLIC SAFETY 2 MOVING INFRACTIONS 2 DISORDERLY CONDUCT 2 JOSTLING 1 CHILD ABANDONMENT/NON SUPPORT 1 HOMICIDE-NEGLIGENT,UNCLASSIFIE 1 Name: OFNS_DESC, dtype: int64
# Number of arrests recorded under each ARREST_BORO code.
nypd_data["ARREST_BORO"].value_counts()
K 2291 B 1964 Q 1843 M 1811 S 387 Name: ARREST_BORO, dtype: int64
# Number of arrests recorded under each PERP_RACE category.
nypd_data["PERP_RACE"].value_counts()
BLACK 4046 WHITE HISPANIC 2210 WHITE 911 BLACK HISPANIC 703 ASIAN / PACIFIC ISLANDER 388 UNKNOWN 21 AMERICAN INDIAN/ALASKAN NATIVE 17 Name: PERP_RACE, dtype: int64
#Boolean filtering: the comparison builds a True/False mask, and indexing with it keeps only the rows where PERP_SEX is "F"
females = nypd_data[nypd_data["PERP_SEX"] == "F"]
#Offense descriptions counted within the filtered rows only
females["OFNS_DESC"].value_counts()
ASSAULT 3 & RELATED OFFENSES 363 FELONY ASSAULT 211 PETIT LARCENY 128 CRIMINAL MISCHIEF & RELATED OF 103 MISCELLANEOUS PENAL LAW 72 ROBBERY 61 DANGEROUS DRUGS 42 OFFENSES AGAINST PUBLIC ADMINI 40 BURGLARY 40 DANGEROUS WEAPONS 37 GRAND LARCENY 35 SEX CRIMES 26 CRIMINAL TRESPASS 24 OFF. AGNST PUB ORD SENSBLTY & 18 INTOXICATED & IMPAIRED DRIVING 15 GRAND LARCENY OF MOTOR VEHICLE 12 VEHICLE AND TRAFFIC LAWS 11 POSSESSION OF STOLEN PROPERTY 9 OTHER OFFENSES RELATED TO THEF 9 MURDER & NON-NEGL. MANSLAUGHTE 6 OFFENSES AGAINST THE PERSON 6 UNAUTHORIZED USE OF A VEHICLE 5 FOR OTHER AUTHORITIES 3 FORGERY 3 BURGLAR'S TOOLS 3 OFFENSES INVOLVING FRAUD 2 OFFENSES RELATED TO CHILDREN 2 FRAUDS 2 OTHER STATE LAWS (NON PENAL LA 2 ALCOHOLIC BEVERAGE CONTROL LAW 1 INTOXICATED/IMPAIRED DRIVING 1 OTHER STATE LAWS 1 GAMBLING 1 NYS LAWS-UNCLASSIFIED FELONY 1 CHILD ABANDONMENT/NON SUPPORT 1 ARSON 1 THEFT-FRAUD 1 Name: OFNS_DESC, dtype: int64
# Use boolean filtering to create one group of rows per borough.
# Each comparison builds a True/False mask on the ARREST_BORO code, and
# indexing with the mask keeps only the matching rows.

# Bronx filtering
bronx = nypd_data[nypd_data["ARREST_BORO"] == "B"]
# Brooklyn filtering
brooklyn = nypd_data[nypd_data["ARREST_BORO"] == "K"]
# Staten Island filtering
staten = nypd_data[nypd_data["ARREST_BORO"] == "S"]
# Queens filtering
queens = nypd_data[nypd_data["ARREST_BORO"] == "Q"]
# Manhattan filtering
manhattan = nypd_data[nypd_data["ARREST_BORO"] == "M"]
# Use value counts to find the OFNS_DESC breakdown by borough.
# Bronx value counts
bronx["OFNS_DESC"].value_counts()
ASSAULT 3 & RELATED OFFENSES 436 FELONY ASSAULT 227 DANGEROUS DRUGS 172 CRIMINAL MISCHIEF & RELATED OF 136 ROBBERY 131 PETIT LARCENY 131 OFFENSES AGAINST PUBLIC ADMINI 108 DANGEROUS WEAPONS 103 BURGLARY 76 MISCELLANEOUS PENAL LAW 64 GRAND LARCENY 49 VEHICLE AND TRAFFIC LAWS 39 OFF. AGNST PUB ORD SENSBLTY & 38 SEX CRIMES 32 POSSESSION OF STOLEN PROPERTY 24 MURDER & NON-NEGL. MANSLAUGHTE 22 CRIMINAL TRESPASS 18 FOR OTHER AUTHORITIES 16 INTOXICATED & IMPAIRED DRIVING 14 OFFENSES AGAINST THE PERSON 14 RAPE 13 FORGERY 12 OFFENSES INVOLVING FRAUD 11 BURGLAR'S TOOLS 11 OTHER OFFENSES RELATED TO THEF 10 GAMBLING 9 GRAND LARCENY OF MOTOR VEHICLE 9 UNAUTHORIZED USE OF A VEHICLE 9 FRAUDS 6 OTHER TRAFFIC INFRACTION 5 NYS LAWS-UNCLASSIFIED FELONY 4 OTHER STATE LAWS (NON PENAL LA 4 ALCOHOLIC BEVERAGE CONTROL LAW 2 OFFENSES AGAINST PUBLIC SAFETY 2 INTOXICATED/IMPAIRED DRIVING 1 HOMICIDE-NEGLIGENT,UNCLASSIFIE 1 ENDAN WELFARE INCOMP 1 JOSTLING 1 DISORDERLY CONDUCT 1 KIDNAPPING & RELATED OFFENSES 1 ARSON 1 Name: OFNS_DESC, dtype: int64
# Brooklyn value counts
brooklyn["OFNS_DESC"].value_counts()
ASSAULT 3 & RELATED OFFENSES 371 FELONY ASSAULT 231 PETIT LARCENY 205 MISCELLANEOUS PENAL LAW 203 DANGEROUS WEAPONS 163 BURGLARY 144 CRIMINAL MISCHIEF & RELATED OF 138 DANGEROUS DRUGS 135 ROBBERY 109 GRAND LARCENY 88 OFFENSES AGAINST PUBLIC ADMINI 79 SEX CRIMES 46 POSSESSION OF STOLEN PROPERTY 43 GRAND LARCENY OF MOTOR VEHICLE 38 CRIMINAL TRESPASS 36 VEHICLE AND TRAFFIC LAWS 33 FORGERY 33 OFF. AGNST PUB ORD SENSBLTY & 28 BURGLAR'S TOOLS 27 INTOXICATED & IMPAIRED DRIVING 21 OFFENSES AGAINST THE PERSON 17 RAPE 13 GAMBLING 10 OTHER OFFENSES RELATED TO THEF 9 OFFENSES INVOLVING FRAUD 9 MURDER & NON-NEGL. MANSLAUGHTE 9 OTHER STATE LAWS (NON PENAL LA 8 FOR OTHER AUTHORITIES 8 NYS LAWS-UNCLASSIFIED FELONY 6 UNAUTHORIZED USE OF A VEHICLE 6 INTOXICATED/IMPAIRED DRIVING 5 OTHER TRAFFIC INFRACTION 4 FRAUDS 4 OTHER STATE LAWS 2 ADMINISTRATIVE CODE 2 HARRASSMENT 2 1 ALCOHOLIC BEVERAGE CONTROL LAW 1 DISORDERLY CONDUCT 1 ARSON 1 MOVING INFRACTIONS 1 Name: OFNS_DESC, dtype: int64
# Staten Island value counts
staten["OFNS_DESC"].value_counts()
ASSAULT 3 & RELATED OFFENSES 55 MISCELLANEOUS PENAL LAW 52 PETIT LARCENY 49 CRIMINAL MISCHIEF & RELATED OF 32 FELONY ASSAULT 29 BURGLARY 24 GRAND LARCENY 23 DANGEROUS DRUGS 17 DANGEROUS WEAPONS 16 OFFENSES AGAINST PUBLIC ADMINI 13 OFF. AGNST PUB ORD SENSBLTY & 10 POSSESSION OF STOLEN PROPERTY 10 VEHICLE AND TRAFFIC LAWS 9 ROBBERY 7 SEX CRIMES 6 GRAND LARCENY OF MOTOR VEHICLE 6 CRIMINAL TRESPASS 5 MURDER & NON-NEGL. MANSLAUGHTE 4 UNAUTHORIZED USE OF A VEHICLE 3 OTHER STATE LAWS (NON PENAL LA 3 INTOXICATED & IMPAIRED DRIVING 3 NYS LAWS-UNCLASSIFIED FELONY 2 OFFENSES AGAINST THE PERSON 2 ARSON 1 OTHER TRAFFIC INFRACTION 1 OFFENSES INVOLVING FRAUD 1 FRAUDS 1 OTHER OFFENSES RELATED TO THEF 1 RAPE 1 Name: OFNS_DESC, dtype: int64
# Queens value counts
queens["OFNS_DESC"].value_counts()
ASSAULT 3 & RELATED OFFENSES 386 MISCELLANEOUS PENAL LAW 213 FELONY ASSAULT 191 CRIMINAL MISCHIEF & RELATED OF 146 PETIT LARCENY 133 ROBBERY 120 BURGLARY 107 DANGEROUS WEAPONS 69 SEX CRIMES 60 GRAND LARCENY 59 DANGEROUS DRUGS 53 OFFENSES AGAINST PUBLIC ADMINI 39 POSSESSION OF STOLEN PROPERTY 28 INTOXICATED & IMPAIRED DRIVING 26 GRAND LARCENY OF MOTOR VEHICLE 21 VEHICLE AND TRAFFIC LAWS 19 UNAUTHORIZED USE OF A VEHICLE 18 OTHER STATE LAWS (NON PENAL LA 17 OFF. AGNST PUB ORD SENSBLTY & 17 CRIMINAL TRESPASS 15 OFFENSES AGAINST THE PERSON 13 MURDER & NON-NEGL. MANSLAUGHTE 11 FRAUDS 10 BURGLAR'S TOOLS 8 FOR OTHER AUTHORITIES 7 FORGERY 6 RAPE 6 OTHER TRAFFIC INFRACTION 6 OFFENSES INVOLVING FRAUD 5 NYS LAWS-UNCLASSIFIED FELONY 5 OTHER OFFENSES RELATED TO THEF 5 INTOXICATED/IMPAIRED DRIVING 4 OTHER STATE LAWS 4 THEFT-FRAUD 3 ARSON 3 OFFENSES RELATED TO CHILDREN 2 ENDAN WELFARE INCOMP 1 CHILD ABANDONMENT/NON SUPPORT 1 MOVING INFRACTIONS 1 ADMINISTRATIVE CODE 1 Name: OFNS_DESC, dtype: int64
# Manhattan value counts
manhattan["OFNS_DESC"].value_counts()
PETIT LARCENY 268 ASSAULT 3 & RELATED OFFENSES 268 BURGLARY 169 DANGEROUS DRUGS 142 FELONY ASSAULT 126 ROBBERY 125 CRIMINAL MISCHIEF & RELATED OF 106 GRAND LARCENY 92 MISCELLANEOUS PENAL LAW 77 DANGEROUS WEAPONS 76 OFFENSES AGAINST PUBLIC ADMINI 75 CRIMINAL TRESPASS 49 OFF. AGNST PUB ORD SENSBLTY & 30 VEHICLE AND TRAFFIC LAWS 29 POSSESSION OF STOLEN PROPERTY 28 MURDER & NON-NEGL. MANSLAUGHTE 25 BURGLAR'S TOOLS 20 SEX CRIMES 17 FORGERY 15 FOR OTHER AUTHORITIES 12 UNAUTHORIZED USE OF A VEHICLE 9 GRAND LARCENY OF MOTOR VEHICLE 8 OFFENSES INVOLVING FRAUD 7 OFFENSES AGAINST THE PERSON 7 INTOXICATED & IMPAIRED DRIVING 6 NYS LAWS-UNCLASSIFIED FELONY 5 RAPE 4 FRAUDS 4 OTHER TRAFFIC INFRACTION 3 GAMBLING 3 OTHER OFFENSES RELATED TO THEF 2 KIDNAPPING & RELATED OFFENSES 1 HARRASSMENT 2 1 OTHER STATE LAWS 1 Name: OFNS_DESC, dtype: int64
# Look at the raw PERP_SEX column: its entries are the strings "F" and "M".
nypd_data["PERP_SEX"]
0 F 1 M 2 M 3 M 4 M .. 8291 M 8292 M 8293 M 8294 M 8295 M Name: PERP_SEX, Length: 8296, dtype: object
# Oftentimes we have data that might not be suitable for our algorithm.
# In this case our data is not numerical, so we need to convert it to be numerical.
# Let's create a function that will assign a 1 to F and a 0 to M.
def perp_gender(row):
    """Encode a single PERP_SEX entry as an integer.

    Despite its name, ``row`` is one value from the column (the function is
    used with ``Series.apply``), not a whole DataFrame row.

    Args:
        row: The PERP_SEX value to encode, expected to be "F" or "M".

    Returns:
        1 if ``row`` is "F", 0 if ``row`` is "M", and None for anything else.
    """
    if row == "F":
        return 1
    if row == "M":
        return 0
    # Anything that is neither "F" nor "M" has no numeric code.
    return None
# Run perp_gender on every PERP_SEX entry and store the results in a new GENDER_NUM column.
nypd_data["GENDER_NUM"] = nypd_data["PERP_SEX"].apply(perp_gender)
# Show the original and encoded columns side by side to check the mapping.
nypd_data[["PERP_SEX","GENDER_NUM"]]
# For HW TONIGHT: create a column for the different race groups.
# Using a function and the apply method, assign a numerical value to each race group.