Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
13 views
# Tami Gabriely
# 10.7.15
# HW3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm


# Function to read a .dat file:
def read(filename):
    """Return every 8th line of *filename* as a list of whitespace-split tokens.

    Each line is split on runs of whitespace; lines 0, 8, 16, ... (0-based)
    are kept and the rest are discarded.  Blank lines yield empty lists.

    NOTE(review): the stride of 8 presumably matches the record layout of
    the data file -- confirm against the file before changing it.
    """
    # The context manager closes the handle even if reading fails.
    with open(filename, 'r') as f:
        rows = [line.split() for line in f]
    return rows[::8]


# Format: Hotel_ID Overall_Rating Value_Rating Room_Rating Location_Rating
#         Cleanliness_Rating Check_in/front_desk_Rating Service_Rating
#         Business_Service_Rating
ratings = read('Vector_shLDA_1999.dat')
R = np.array(ratings)

# Split into separate lists for each variable (column 0 is the hotel id):
overall = list(R[:, 1])
value = list(R[:, 2])
room = list(R[:, 3])
location = list(R[:, 4])
cleanliness = list(R[:, 5])
CIFD = list(R[:, 6])
service = list(R[:, 7])
BS = list(R[:, 8])

# The different "aspect" scores in floats:
x1 = [float(i) for i in value]
x2 = [float(i) for i in room]
x3 = [float(i) for i in location]
x4 = [float(i) for i in cleanliness]
x5 = [float(i) for i in CIFD]
x6 = [float(i) for i in service]
x7 = [float(i) for i in BS]

# The Overall scores in floats:
y = [float(i) for i in overall]

# Single-argument print(...) behaves the same as the print statement in
# Python 2 and is also valid Python 3.
print("Overall: " + str(sum(y) / len(y)))
print("Value: " + str(sum(x1) / len(x1)))
print("Room: " + str(sum(x2) / len(x2)))
print("Location: " + str(sum(x3) / len(x3)))
print("Cleanliness: " + str(sum(x4) / len(x4)))
print("Check In/Front Desk: " + str(sum(x5) / len(x5)))
print("Service: " + str(sum(x6) / len(x6)))
print("Business Service: " + str(sum(x7) / len(x7)))
# Average scores:
Overall: 3.8025290973 Value: 3.80120999459 Room: 3.77717380541 Location: 4.14716877297 Cleanliness: 4.06411162162 Check In/Front Desk: 3.96843198919 Service: 3.90220738378 Business Service: 3.59470883784
# Using least squares regression:
# linregress returns (slope, intercept, r, p, std err)
m, b, r, p, stdErr = stats.linregress(x1, y)
fit_fn = np.poly1d([m, b])  # fit_fn(v) evaluates m*v + b

# Yellow circles: (Value, Overall) pairs; dashed black: the fitted line.
plt.plot(x1, y, 'yo', x1, fit_fn(x1), '--k')
plt.axis([1, 5, 1, 5])  # both axes fixed to [1, 5]
plt.title('Value Rating')
plt.show()

# One blank line before and after the fit statistics.
print("\n".join([
    "",
    "m: " + str(m),
    "b: " + str(b),
    "correlation coefficient: " + str(r),
    "R squared: " + str(r**2),
    "Standard Error: " + str(stdErr),
    "",
]))
# Value Rating:
[<matplotlib.lines.Line2D object at 0x7f805589cf90>, <matplotlib.lines.Line2D object at 0x7f805589ca50>] [1, 5, 1, 5] <matplotlib.text.Text object at 0x7f80529f8690>
m: 1.05918860128 b: -0.223669200045 correlation coefficient: 0.930025335386 R squared: 0.86494712446 Standard Error: 0.00973597506888
# linregress returns (slope, intercept, r, p, std err)
m, b, r, p, stdErr = stats.linregress(x2, y)
fit_fn = np.poly1d([m, b])  # fit_fn(v) evaluates m*v + b

# Blue circles: (Room, Overall) pairs; dashed black: the fitted line.
plt.plot(x2, y, 'bo', x2, fit_fn(x2), '--k')
plt.axis([1, 5, 1, 5])  # both axes fixed to [1, 5]
plt.title('Room Rating')
plt.show()

# One blank line before and after the fit statistics.
print("\n".join([
    "",
    "m: " + str(m),
    "b: " + str(b),
    "correlation coefficient: " + str(r),
    "R squared: " + str(r**2),
    "Standard Error: " + str(stdErr),
    "",
]))
# Room Rating:
[<matplotlib.lines.Line2D object at 0x7f8054943590>, <matplotlib.lines.Line2D object at 0x7f80549438d0>] [1, 5, 1, 5] <matplotlib.text.Text object at 0x7f8052b8f9d0>
m: 0.893059900952 b: 0.429286632762 correlation coefficient: 0.927807670299 R squared: 0.860827073066 Standard Error: 0.0083531259173
# linregress returns (slope, intercept, r, p, std err)
m, b, r, p, stdErr = stats.linregress(x3, y)
fit_fn = np.poly1d([m, b])  # fit_fn(v) evaluates m*v + b

# Red circles: (Location, Overall) pairs; dashed black: the fitted line.
plt.plot(x3, y, 'ro', x3, fit_fn(x3), '--k')
plt.axis([1, 5, 1, 5])  # both axes fixed to [1, 5]
plt.title('Location Rating')
plt.show()

# One blank line before and after the fit statistics.
print("\n".join([
    "",
    "m: " + str(m),
    "b: " + str(b),
    "correlation coefficient: " + str(r),
    "R squared: " + str(r**2),
    "Standard Error: " + str(stdErr),
    "",
]))
# Location Rating
[<matplotlib.lines.Line2D object at 0x7f8052e93fd0>, <matplotlib.lines.Line2D object at 0x7f8052ea3250>] [1, 5, 1, 5] <matplotlib.text.Text object at 0x7f8051717b10>
m: 0.652408591183 b: 1.09688056072 correlation coefficient: 0.581282868434 R squared: 0.337889773135 Standard Error: 0.0212444901235
# linregress returns (slope, intercept, r, p, std err)
m, b, r, p, stdErr = stats.linregress(x4, y)
fit_fn = np.poly1d([m, b])  # fit_fn(v) evaluates m*v + b

# Cyan circles: (Cleanliness, Overall) pairs; dashed black: the fitted line.
plt.plot(x4, y, 'co', x4, fit_fn(x4), '--k')
plt.axis([1, 5, 1, 5])  # both axes fixed to [1, 5]
plt.title('Cleanliness Rating')
plt.show()

# One blank line before and after the fit statistics.
print("\n".join([
    "",
    "m: " + str(m),
    "b: " + str(b),
    "correlation coefficient: " + str(r),
    "R squared: " + str(r**2),
    "Standard Error: " + str(stdErr),
    "",
]))
# Cleanliness Rating
[<matplotlib.lines.Line2D object at 0x7f805398fa90>, <matplotlib.lines.Line2D object at 0x7f8054385d50>] [1, 5, 1, 5] <matplotlib.text.Text object at 0x7f8051da1810>
m: 0.970128684613 b: -0.140182164309 correlation coefficient: 0.939728339211 R squared: 0.883089351516 Standard Error: 0.00821113274576
# linregress returns (slope, intercept, r, p, std err)
m, b, r, p, stdErr = stats.linregress(x5, y)
fit_fn = np.poly1d([m, b])  # fit_fn(v) evaluates m*v + b

# Magenta circles: (Check in/Front Desk, Overall) pairs; dashed black: fit.
plt.plot(x5, y, 'mo', x5, fit_fn(x5), '--k')
plt.axis([1, 5, 1, 5])  # both axes fixed to [1, 5]
plt.title('Check in/ Front Desk Rating')
plt.show()

# One blank line before and after the fit statistics.
print("\n".join([
    "",
    "m: " + str(m),
    "b: " + str(b),
    "correlation coefficient: " + str(r),
    "R squared: " + str(r**2),
    "Standard Error: " + str(stdErr),
    "",
]))
# Check in/ Front Desk Rating:
[<matplotlib.lines.Line2D object at 0x7f8051d94990>, <matplotlib.lines.Line2D object at 0x7f8051d943d0>] [1, 5, 1, 5] <matplotlib.text.Text object at 0x7f8054e97550>
m: 1.03230413441 b: -0.294099652274 correlation coefficient: 0.878673457424 R squared: 0.772067044782 Standard Error: 0.0130476631572
# linregress returns (slope, intercept, r, p, std err)
m, b, r, p, stdErr = stats.linregress(x6, y)
fit_fn = np.poly1d([m, b])  # fit_fn(v) evaluates m*v + b

# Green circles: (Service, Overall) pairs; dashed black: the fitted line.
plt.plot(x6, y, 'go', x6, fit_fn(x6), '--k')
plt.axis([1, 5, 1, 5])  # both axes fixed to [1, 5]
plt.title('Service Rating')
plt.show()

# One blank line before and after the fit statistics.
print("\n".join([
    "",
    "m: " + str(m),
    "b: " + str(b),
    "correlation coefficient: " + str(r),
    "R squared: " + str(r**2),
    "Standard Error: " + str(stdErr),
    "",
]))
# Service Rating:
[<matplotlib.lines.Line2D object at 0x7f80558d6cd0>, <matplotlib.lines.Line2D object at 0x7f8051701350>] [1, 5, 1, 5] <matplotlib.text.Text object at 0x7f80558f89d0>
m: 1.02056811588 b: -0.179939340144 correlation coefficient: 0.931313188561 R squared: 0.867344255189 Standard Error: 0.00928449508731
# linregress returns (slope, intercept, r, p, std err)
m, b, r, p, stdErr = stats.linregress(x7, y)
fit_fn = np.poly1d([m, b])  # fit_fn(v) evaluates m*v + b

# White circles: (Business Service, Overall) pairs; dashed black: the fit.
plt.plot(x7, y, 'wo', x7, fit_fn(x7), '--k')
plt.axis([1, 5, 1, 5])  # both axes fixed to [1, 5]
plt.title('Business Service Rating')
plt.show()

# One blank line before and after the fit statistics.
print("\n".join([
    "",
    "m: " + str(m),
    "b: " + str(b),
    "correlation coefficient: " + str(r),
    "R squared: " + str(r**2),
    "Standard Error: " + str(stdErr),
    "",
]))
# Business Service Rating:
[<matplotlib.lines.Line2D object at 0x7f805398f290>, <matplotlib.lines.Line2D object at 0x7f805398ffd0>] [1, 5, 1, 5] <matplotlib.text.Text object at 0x7f8055773310>
m: 0.821229375572 b: 0.850448603036 correlation coefficient: 0.790014863251 R squared: 0.624123484157 Standard Error: 0.0148252090647
# Summary of the R-squared Scores:
'''
Value = 0.86494712446
Room = 0.860827073066
Location = 0.337889773135
Cleanliness = 0.883089351516
Check In/Front Desk = 0.772067044782
Service = 0.867344255189
Business Service = 0.624123484157
'''
# Location is the smallest contributor to the overall scores,
# while cleanliness, value, room, and service ratings are big contributors.
# Interestingly enough, location also happens to have the highest average score.

# ︠ca3fa413-10d5-4071-ac05-2ae142dbe4b5︠

# For multivariate regression:
x = [x1, x2, x3, x4, x5, x6, x7]


def reg_m(y, x):
    """Fit an ordinary-least-squares model of *y* on the regressors in *x*.

    Parameters:
        y: sequence of response values.
        x: non-empty sequence of equal-length regressor sequences.

    Returns:
        The fitted statsmodels OLS results object.  The design matrix
        columns are x[0], x[1], ..., x[-1] followed by a column of ones,
        so coefficient k in the results corresponds to x[k] and the
        intercept comes last.

    NOTE: earlier revisions prepended each regressor to the left of the
    matrix, which reversed the column order; summaries recorded from those
    revisions list the regressors in the opposite order (their "x1" was
    the last element of *x*).
    """
    ones = np.ones(len(x[0]))
    # The intercept column is appended by hand, so sm.add_constant is not
    # needed (it would find the existing constant column and add nothing).
    X = np.column_stack(list(x) + [ones])
    results = sm.OLS(y, X).fit()
    return results


print(reg_m(y, x).summary())
OLS Regression Results ============================================================================== Dep. Variable: y R-squared: 0.956 Model: OLS Adj. R-squared: 0.955 Method: Least Squares F-statistic: 5651. Date: Thu, 08 Oct 2015 Prob (F-statistic): 0.00 Time: 17:19:44 Log-Likelihood: 1045.2 No. Observations: 1850 AIC: -2074. Df Residuals: 1842 BIC: -2030. Df Model: 7 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [95.0% Conf. Int.] ------------------------------------------------------------------------------ x1 0.0076 0.009 0.848 0.396 -0.010 0.025 x2 0.2732 0.018 14.913 0.000 0.237 0.309 x3 -0.0082 0.016 -0.509 0.611 -0.040 0.023 x4 0.1742 0.018 9.953 0.000 0.140 0.209 x5 0.1004 0.007 15.085 0.000 0.087 0.113 x6 0.2593 0.014 17.903 0.000 0.231 0.288 x7 0.3222 0.014 23.465 0.000 0.295 0.349 const -0.5871 0.027 -21.553 0.000 -0.640 -0.534 ============================================================================== Omnibus: 83.226 Durbin-Watson: 1.996 Prob(Omnibus): 0.000 Jarque-Bera (JB): 271.006 Skew: -0.043 Prob(JB): 1.42e-59 Kurtosis: 4.873 Cond. No. 94.2 ============================================================================== Warnings: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Some experimentation with correlating some aspect scores with each other:
# linregress returns (slope, intercept, r, p, std err)
m, b, r, p, stdErr = stats.linregress(x3, x1)
fit_fn = np.poly1d([m, b])  # fit_fn(v) evaluates m*v + b

# White circles: (Location, Value) pairs; dashed black: the fitted line.
plt.plot(x3, x1, 'wo', x3, fit_fn(x3), '--k')
plt.axis([1, 5, 1, 5])  # both axes fixed to [1, 5]
plt.title('Value (y) / Location (x)')
plt.show()

# One blank line before and after the fit statistics.
print("\n".join([
    "",
    "m: " + str(m),
    "b: " + str(b),
    "correlation coefficient: " + str(r),
    "R squared: " + str(r**2),
    "Standard Error: " + str(stdErr),
    "",
]))
# Is location correlated with value?
[<matplotlib.lines.Line2D object at 0x7f8051821410>, <matplotlib.lines.Line2D object at 0x7f80518217d0>] [1, 5, 1, 5] <matplotlib.text.Text object at 0x7f80549f0390>
m: 0.506589791298 b: 1.70029663142 correlation coefficient: 0.51404697135 R squared: 0.264244288754 Standard Error: 0.0196638913779
# linregress returns (slope, intercept, r, p, std err)
m, b, r, p, stdErr = stats.linregress(x4, x2)
fit_fn = np.poly1d([m, b])  # fit_fn(v) evaluates m*v + b

# White circles: (Cleanliness, Room) pairs; dashed black: the fitted line.
plt.plot(x4, x2, 'wo', x4, fit_fn(x4), '--k')
plt.axis([1, 5, 1, 5])  # both axes fixed to [1, 5]
plt.title('Room (y) / Cleanliness (x)')
plt.show()

# One blank line before and after the fit statistics.
print("\n".join([
    "",
    "m: " + str(m),
    "b: " + str(b),
    "correlation coefficient: " + str(r),
    "R squared: " + str(r**2),
    "Standard Error: " + str(stdErr),
    "",
]))
# Should room be correlated with cleanliness?
[<matplotlib.lines.Line2D object at 0x7f8054601ed0>, <matplotlib.lines.Line2D object at 0x7f805460da90>] [1, 5, 1, 5] <matplotlib.text.Text object at 0x7f8054742790>
m: 1.00811865289 b: -0.319932927773 correlation coefficient: 0.939955426758 R squared: 0.883516204292 Standard Error: 0.00851502964697
# linregress returns (slope, intercept, r, p, std err)
m, b, r, p, stdErr = stats.linregress(x6, x5)
fit_fn = np.poly1d([m, b])  # fit_fn(v) evaluates m*v + b

# White circles: (Service, Check In/Front Desk) pairs; dashed black: the fit.
plt.plot(x6, x5, 'wo', x6, fit_fn(x6), '--k')
plt.axis([1, 5, 1, 5])  # both axes fixed to [1, 5]
plt.title('Check In/Front Desk (y) / Service (x)')
plt.show()

# One blank line before and after the fit statistics.
print("\n".join([
    "",
    "m: " + str(m),
    "b: " + str(b),
    "correlation coefficient: " + str(r),
    "R squared: " + str(r**2),
    "Standard Error: " + str(stdErr),
    "",
]))
# Is Check In/Front Desk correlated with service?
[<matplotlib.lines.Line2D object at 0x7f8052c8a650>, <matplotlib.lines.Line2D object at 0x7f8052c8a890>] [1, 5, 1, 5] <matplotlib.text.Text object at 0x7f8054756c10>
m: 0.866869091498 b: 0.585729019571 correlation coefficient: 0.929367399954 R squared: 0.863723764098 Standard Error: 0.00800986442343
# linregress returns (slope, intercept, r, p, std err)
m, b, r, p, stdErr = stats.linregress(x6, x7)
fit_fn = np.poly1d([m, b])  # fit_fn(v) evaluates m*v + b

# White circles: (Service, Business Service) pairs; dashed black: the fit.
plt.plot(x6, x7, 'wo', x6, fit_fn(x6), '--k')
plt.axis([1, 5, 1, 5])  # both axes fixed to [1, 5]
plt.title('Business service (y) / Service (x)')
plt.show()

# One blank line before and after the fit statistics.
print("\n".join([
    "",
    "m: " + str(m),
    "b: " + str(b),
    "correlation coefficient: " + str(r),
    "R squared: " + str(r**2),
    "Standard Error: " + str(stdErr),
    "",
]))
# How about business service with service?
[<matplotlib.lines.Line2D object at 0x7f80506bf310>, <matplotlib.lines.Line2D object at 0x7f80506bf550>] [1, 5, 1, 5] <matplotlib.text.Text object at 0x7f805443b7d0>
m: 0.818479222835 b: 0.400833171016 correlation coefficient: 0.776409123186 R squared: 0.602811126566 Standard Error: 0.0154548424779
# linregress returns (slope, intercept, r, p, std err)
m, b, r, p, stdErr = stats.linregress(x7, x3)
fit_fn = np.poly1d([m, b])  # fit_fn(v) evaluates m*v + b

# White circles: (Business Service, Location) pairs; dashed black: the fit.
plt.plot(x7, x3, 'wo', x7, fit_fn(x7), '--k')
plt.axis([1, 5, 1, 5])  # both axes fixed to [1, 5]
plt.title('Location (y) / Business service (x)')
plt.show()

# One blank line before and after the fit statistics.
print("\n".join([
    "",
    "m: " + str(m),
    "b: " + str(b),
    "correlation coefficient: " + str(r),
    "R squared: " + str(r**2),
    "Standard Error: " + str(stdErr),
    "",
]))
# How about business service with location? (shouldn't be very correlated)
[<matplotlib.lines.Line2D object at 0x7f8051a08710>, <matplotlib.lines.Line2D object at 0x7f8050b33a90>] [1, 5, 1, 5] <matplotlib.text.Text object at 0x7f8054968190>
m: 0.393586437611 b: 2.73234012724 correlation coefficient: 0.424955113006 R squared: 0.18058684807 Standard Error: 0.0195028260679