Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
YStrano
GitHub Repository: YStrano/DataScience_GA
Path: blob/master/lessons/lesson_09/code/Clustering and Regression.ipynb
1904 views
Kernel: Python [conda env:Anaconda3]

Clustering and Regression

%matplotlib inline from __future__ import print_function import random from matplotlib import pyplot as plt import numpy as np import pandas as pd import seaborn as sns from sklearn.cluster import DBSCAN from sklearn.linear_model import LinearRegression # Plot the data def set_colors(labels, colors='rgbykcm'): colored_labels = [] for label in labels: colored_labels.append(colors[label]) return colored_labels

Now we generate some sample data.

## Create some synthetic data from scipy.stats import multivariate_normal data = [] # dist = multivariate_normal(mean=[0,0], cov=[[0.5, 0.1],[0,0.01]]) dist = multivariate_normal(mean=[0, 0], cov=[[0.5, 0.5],[0,0.1]]) for i in range(150): p = list(dist.rvs()) data.append(dist.rvs()) # dist = multivariate_normal(mean=[1,5], cov=[[0.5, 0.2],[0,0.02]]) dist = multivariate_normal(mean=[1, 5], cov=[[0.5, 0.5],[0,0.1]]) for i in range(150): data.append(dist.rvs()) dist = multivariate_normal(mean=[2, 10], cov=[[0.5, 0.5],[0,0.1]]) for i in range(150): data.append(dist.rvs()) df = pd.DataFrame(data, columns=["x", "y"]) df.head() plt.scatter(df['x'], df['y']) plt.show()
C:\Users\jlandesman\Anaconda3\lib\site-packages\scipy\stats\_multivariate.py:651: RuntimeWarning: covariance is not positive-semidefinite. out = random_state.multivariate_normal(mean, cov, size)
Image in a Jupyter notebook

Find Clusters

# Fit a DBSCAN estimator estimator = DBSCAN(eps=0.8, min_samples=10) X = df[["x", "y"]] estimator.fit(X) # Clusters are given in the labels_ attribute labels = estimator.labels_ colors = set_colors(labels) plt.scatter(df['x'], df['y'], c=colors) plt.xlabel("x") plt.ylabel("y") plt.show()
Image in a Jupyter notebook

Add Cluster Labels back to the Data Frame and Fit a Linear Model

df["cluster"] = labels df = pd.concat([df, pd.get_dummies(df['cluster'], prefix="cluster")], axis=1) df.head()
model = LinearRegression() X = df[["x", "cluster_0", "cluster_1", "cluster_2"]] y = df['y'] model.fit(X, y) print(model.score(X, y))
0.9853551208165351
# Plot the model colors = set_colors(labels) plt.scatter(df['x'], df['y'], c=colors) plt.xlabel("x") plt.ylabel("y") plt.scatter(df["x"], model.predict(X), color='black') plt.show()
Image in a Jupyter notebook

Another example

## Create some synthetic data from scipy.stats import multivariate_normal data = [] # dist = multivariate_normal(mean=[0,0], cov=[[0.5, 0.1],[0,0.01]]) dist = multivariate_normal(mean=[0, 0], cov=[[0.1, 0.5],[0,0.2]]) for i in range(150): p = list(dist.rvs()) data.append(dist.rvs()) dist = multivariate_normal(mean=[1, 5], cov=[[0.6, 0.],[0.2 ,0.1]]) for i in range(150): data.append(dist.rvs()) dist = multivariate_normal(mean=[2, 10], cov=[[0.5, 0.5],[0,0.1]]) for i in range(150): data.append(dist.rvs()) df = pd.DataFrame(data, columns=["x", "y"]) df.head() plt.scatter(df['x'], df['y']) plt.show()
C:\Users\jlandesman\Anaconda3\lib\site-packages\scipy\stats\_multivariate.py:651: RuntimeWarning: covariance is not positive-semidefinite. out = random_state.multivariate_normal(mean, cov, size)
Image in a Jupyter notebook
# Fit a DBSCAN estimator estimator = DBSCAN(eps=0.8, min_samples=10) X = df[["x", "y"]] estimator.fit(X) # Clusters are given in the labels_ attribute labels = estimator.labels_ df["cluster"] = labels colors = set_colors(labels) plt.scatter(df['x'], df['y'], c=colors) plt.xlabel("x") plt.ylabel("y") plt.show()
Image in a Jupyter notebook

Modeling

This time we have to fit a model to each cluster since they are not the same shape with offsets.

from collections import Counter counts = Counter(labels) print(counts)
Counter({0: 150, 1: 149, 2: 149, -1: 2})
# Plot the raw data plt.scatter(df['x'], df['y'], c=colors) plt.xlabel("x") plt.ylabel("y") # Fit a model to each cluster models = dict() for label in set(labels): if counts[label] > 10: model = LinearRegression() subdf = df[df["cluster"] == label] X = subdf[['x']] y = subdf[['y']] model.fit(X, y) models["label"] = model plt.scatter(X, model.predict(X), color="black") plt.show()
Image in a Jupyter notebook