Kernel: Python 3
In [1]:
# Core numerical / tabular / plotting stack used by the rest of the notebook.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Seed NumPy's legacy global generator.
np.random.seed(12345)

# Default figure size for every matplotlib figure.
plt.rc('figure', figsize=(10, 6))

# Remember the current row limit before overriding it; the final cell of the
# notebook assigns this value back to pd.options.display.max_rows.
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_colwidth', 80)

# Compact array printing: 4 decimals, no scientific notation for small values.
np.set_printoptions(precision=4, suppress=True)
In [2]:
# Small all-numeric frame with columns x0, x1 and y.
data = pd.DataFrame(
    {
        'x0': [1, 2, 3, 4, 5],
        'x1': [0.01, -0.01, 0.25, -4.1, 0.],
        'y': [-1.5, 0., 3.6, 1.3, -2.],
    }
)
data
data.columns
# Convert the frame to a plain NumPy array.
data.to_numpy()
In [3]:
# Round-trip: array form of `data` back into a DataFrame with new column labels.
df2 = pd.DataFrame(
    data=data.to_numpy(),
    columns=['one', 'two', 'three'],
)
df2
In [4]:
# Copy of `data` with an extra string-valued column; `data` itself is untouched.
df3 = data.assign(strings=['a', 'b', 'c', 'd', 'e'])
df3
# Array form of a frame that mixes numeric and string columns.
df3.to_numpy()
In [5]:
# Columns to extract as a NumPy array.
model_cols = ['x0', 'x1']
data[model_cols].to_numpy()
In [6]:
# Attach a two-level categorical column ('a' / 'b') to the existing frame.
data['category'] = pd.Categorical(
    ['a', 'b', 'a', 'a', 'b'],
    categories=['a', 'b'],
)
data
In [7]:
# One float indicator column per category level, named with a 'category' prefix.
dummies = pd.get_dummies(data['category'], prefix='category', dtype=float)

# Swap the original categorical column for its indicator columns.
data_with_dummies = data.drop(columns='category').join(dummies)
data_with_dummies
In [8]:
# Rebuild `data` from scratch so it holds only x0, x1 and y again
# (an earlier cell added a 'category' column to it).
data = pd.DataFrame(
    {
        'x0': [1, 2, 3, 4, 5],
        'x1': [0.01, -0.01, 0.25, -4.1, 0.],
        'y': [-1.5, 0., 3.6, 1.3, -2.],
    }
)
data

import patsy

# Build the response (y) and design (X) matrices from a formula string.
y, X = patsy.dmatrices('y ~ x0 + x1', data=data)
In [9]:
# Inspect the two matrices returned by patsy.dmatrices.
y
X
In [10]:
# View the patsy matrices as plain NumPy arrays.
np.asarray(y)
np.asarray(X)
In [11]:
# Same predictors with '+ 0' appended to the formula; element [1] of the
# returned pair is the right-hand-side (design) matrix.
patsy.dmatrices('y ~ x0 + x1 + 0', data=data)[1]
In [12]:
# Least-squares solve of X @ coef ~= y; only the first two of the four
# returned values are kept under meaningful names.
coef, resid, _, _ = np.linalg.lstsq(
    X,
    y,
    rcond=None,
)
In [13]:
coef
# Drop length-1 axes and label each coefficient with its design-matrix column name.
coef = pd.Series(
    np.squeeze(coef),
    index=X.design_info.column_names,
)
coef
In [14]:
# Formula containing an inline NumPy expression applied to x1.
y, X = patsy.dmatrices(
    'y ~ x0 + np.log(np.abs(x1) + 1)',
    data=data,
)
X
In [15]:
# Formula using the standardize() and center() transforms on x0 and x1.
y, X = patsy.dmatrices(
    'y ~ standardize(x0) + center(x1)',
    data=data,
)
X
In [16]:
# Fresh observations with the same column names as `data`.
new_data = pd.DataFrame(
    {
        'x0': [6, 7, 8, 9],
        'x1': [3.1, -0.5, 0, 2.3],
        'y': [1, 2, 3, 4],
    }
)

# Re-use the design info attached to the existing X to build matrices for
# the new rows.
new_X = patsy.build_design_matrices([X.design_info], data=new_data)
new_X
In [17]:
# Wrap x0 + x1 in I(...) inside the formula.
y, X = patsy.dmatrices(
    'y ~ I(x0 + x1)',
    data=data,
)
X
In [18]:
# New frame: a string key (key1), an integer key (key2) and two value columns.
data = pd.DataFrame(
    {
        'key1': ['a', 'a', 'b', 'b', 'a', 'b', 'a', 'b'],
        'key2': [0, 1, 0, 1, 0, 1, 0, 0],
        'v1': [1, 2, 3, 4, 5, 6, 7, 8],
        'v2': [-1, 0, 2.5, -0.5, 4.0, -1.2, 0.2, -1.7],
    }
)

# Use the string-valued key1 column as the only term on the right-hand side.
y, X = patsy.dmatrices('v2 ~ key1', data=data)
X
In [19]:
# Same term as before, with '+ 0' appended to the formula.
y, X = patsy.dmatrices(
    'v2 ~ key1 + 0',
    data=data,
)
X
In [20]:
# Wrap the integer-valued key2 column in C(...) inside the formula.
y, X = patsy.dmatrices(
    'v2 ~ C(key2)',
    data=data,
)
X
In [21]:
# Relabel key2 from the integers 0/1 to the strings 'zero'/'one'.
#
# A plain `.map({0: 'zero', 1: 'one'})` turns every value that is not a key of
# the dict into NaN, so running this cell a second time (when key2 already
# holds 'zero'/'one') would wipe the column. Falling back to the existing
# value keeps the cell safe to re-run while producing the same result on the
# first run.
key2_labels = {0: 'zero', 1: 'one'}
data['key2'] = data['key2'].map(lambda value: key2_labels.get(value, value))
data

# Two categorical terms.
y, X = patsy.dmatrices('v2 ~ key1 + key2', data)
X

# Same two terms plus the key1:key2 term.
y, X = patsy.dmatrices('v2 ~ key1 + key2 + key1:key2', data)
X
In [22]:
# statsmodels: array-based API (sm) and formula-based API (smf).
import statsmodels.api as sm
import statsmodels.formula.api as smf
In [23]:
# To make the example reproducible
rng = np.random.default_rng(seed=12345)


def dnorm(mean, variance, size=1):
    """Draw normally distributed samples with the given mean and variance.

    Parameters
    ----------
    mean : float
        Value added to every draw.
    variance : float
        Variance of the draws; the standard-normal samples are scaled by
        ``sqrt(variance)``.
    size : int or tuple of int, default 1
        Output shape. An integer ``n`` yields a 1-D array of length ``n``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``size`` drawn from the module-level ``rng``.
    """
    # Pass the shape through as a single argument. Unpacking it with `*size`
    # would hand the second dimension to standard_normal's `dtype` parameter
    # and fail for any multi-dimensional shape.
    return mean + np.sqrt(variance) * rng.standard_normal(size)


N = 100

# Three predictor columns, each drawn with its own variance.
X = np.c_[dnorm(0, 0.4, size=N),
          dnorm(0, 0.6, size=N),
          dnorm(0, 0.2, size=N)]

# Noise term added to the linear combination below.
eps = dnorm(0, 0.1, size=N)
beta = [0.1, 0.3, 0.5]

# Response: X @ beta plus noise (no constant term).
y = np.dot(X, beta) + eps
In [24]:
# First five rows of the simulated predictors and response.
X[:5]
y[:5]
In [25]:
# Pass X through sm.add_constant and look at the first five rows of the result.
X_model = sm.add_constant(X)
X_model[:5]
In [26]:
# Ordinary least squares on the raw X (not X_model); y was generated as
# X @ beta + eps with no constant term.
model = sm.OLS(endog=y, exog=X)
In [27]:
# Fit the model and show the estimated coefficients.
results = model.fit()
results.params
In [28]:
# print() so the summary is emitted via its plain-text string form.
print(
    results.summary()
)
In [29]:
# Put the simulated predictors and response into one labelled frame.
data = pd.DataFrame(X, columns=['col0', 'col1', 'col2']).assign(y=y)
data.iloc[:5]
In [30]:
# Same regression through the formula API, fitted in one chained call.
results = smf.ols(
    'y ~ col0 + col1 + col2',
    data=data,
).fit()
results.params
results.tvalues
In [31]:
# Predictions for the first five rows of the fitting frame.
results.predict(data.iloc[:5])
In [32]:
# Simulate a series where each new value depends on the previous two:
#     x[t] = b0 * x[t-1] + b1 * x[t-2] + noise[t]
init_x = 4
values = [init_x, init_x]  # two starting values seed the recursion
N = 1000

b0 = 0.8
b1 = -0.4
noise = dnorm(0, 0.1, N)

# One appended value per noise draw (N in total).
for shock in noise:
    values.append(values[-1] * b0 + values[-2] * b1 + shock)
In [33]:
from statsmodels.tsa.ar_model import AutoReg

# Lag argument handed to AutoReg.
MAXLAGS = 5

model = AutoReg(values, MAXLAGS)
results = model.fit()
In [34]:
# Estimated parameters of the fitted AutoReg model.
results.params
In [35]:
# Load the Titanic training and test CSVs (paths relative to the working directory).
train = pd.read_csv('datasets/titanic/train.csv')
test = pd.read_csv('datasets/titanic/test.csv')

# Peek at the first four training rows.
train.head(4)
In [36]:
# Count of missing values per column in each frame.
train.isna().sum()
test.isna().sum()
In [37]:
# Median Age computed from the training frame only; that single value fills
# the gaps in both the training and the test frame.
impute_value = train['Age'].median()

train['Age'] = train['Age'].fillna(value=impute_value)
test['Age'] = test['Age'].fillna(value=impute_value)
In [38]:
# Integer indicator: 1 where Sex equals 'female', otherwise 0.
train['IsFemale'] = train['Sex'].eq('female').astype(int)
test['IsFemale'] = test['Sex'].eq('female').astype(int)
In [39]:
# Feature columns handed to the model.
predictors = ['Pclass', 'IsFemale', 'Age']

# Convert the features and the training target to NumPy arrays.
X_train = train[predictors].to_numpy()
X_test = test[predictors].to_numpy()
y_train = train['Survived'].to_numpy()

X_train[:5]
y_train[:5]
In [40]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression estimator created with its default settings.
model = LogisticRegression()
In [41]:
# Fit the estimator on the training features and target.
model.fit(X=X_train, y=y_train)
In [42]:
# Predicted labels for the test features; show the first ten.
y_predict = model.predict(X_test)
y_predict[:10]
In [43]:
from sklearn.linear_model import LogisticRegressionCV

# Cross-validated logistic regression constructed with Cs=10.
model_cv = LogisticRegressionCV(Cs=10)
model_cv.fit(X=X_train, y=y_train)
In [44]:
from sklearn.model_selection import cross_val_score

# Rebinds `model` to a new, unfitted estimator with C=10, then scores it
# with cv=4.
model = LogisticRegression(C=10)
scores = cross_val_score(
    model,
    X_train,
    y_train,
    cv=4,
)
scores
In [45]:
In [46]:
# Put the row-display limit back to the value saved in PREVIOUS_MAX_ROWS.
pd.set_option('display.max_rows', PREVIOUS_MAX_ROWS)