In [ ]:
import numpy as np
import pandas as pd

# Load the watermelon dataset and encode the categorical attributes as integers
df = pd.read_csv('data/melon3.0a.csv')
X_rcol, y_rcol = df.columns[1:-2], df.columns[-2]  # columns used for regression
X_ccol, y_ccol = df.columns[1:-1], df.columns[-1]  # columns used for classification
value_map = {
    '色泽': {'浅白': 0, '青绿': 1, '乌黑': 2},
    '根蒂': {'蜷缩': 0, '稍蜷': 1, '硬挺': 2},
    '敲声': {'沉闷': 0, '浊响': 1, '清脆': 2},
    '纹理': {'模糊': 0, '稍糊': 1, '清晰': 2},
    '脐部': {'凹陷': 0, '稍凹': 1, '平坦': 2},
    '触感': {'硬滑': 0, '软粘': 1},
    '好瓜': {'是': 1, '否': 0},
}
for col in value_map:
    df[col] = df[col].map(value_map[col])

# Append a constant column so the bias term is absorbed into the weight vector
X = np.concatenate([df[X_rcol].values, np.ones((df.shape[0], 1))], axis=1)
y = df[y_rcol].values.reshape(-1, 1)

def linear_regression(X, y):
    # Closed-form least-squares solution via the normal equations
    return np.linalg.inv(X.T @ X) @ X.T @ y

linear_regression(X, y).T
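As a quick cross-check (not part of the original cell), np.linalg.lstsq solves the same least-squares problem without forming an explicit inverse and should return essentially the same coefficients:

In [ ]:
# Cross-check of the closed-form solution; uses the same X, y as above
w_lstsq, *_ = np.linalg.lstsq(X, y, rcond=None)
w_lstsq.T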
The loss function is the mean squared error (MSE).
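Written out for the design matrix $X$ (with the constant column included) and targets $\boldsymbol y$, the objective evaluated below is

$$ E(\boldsymbol w) = \frac{1}{m} \sum_{i=1}^m \left(\boldsymbol w^\top \boldsymbol x_i - y_i\right)^2 = \frac{1}{m} \left\lVert X \boldsymbol w - \boldsymbol y \right\rVert_2^2 $$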
In [ ]:
def mse_loss(X, y, w):
    # Mean squared error of the predictions X @ w against the targets y
    return np.mean((X @ w - y) ** 2)

mse_loss(X, y, linear_regression(X, y))
When the sample matrix is not of full rank, multiple models fit the training set equally well. In that case a regularization term can be added to the objective, for example an L2 penalty (the sum of the squared weights), and the solution is then computed numerically, e.g. by gradient descent.
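Concretely, with an L2 penalty of strength $\lambda$ the objective and its gradient are

$$ E(\boldsymbol w) = \left\lVert X \boldsymbol w - \boldsymbol y \right\rVert_2^2 + \lambda \left\lVert \boldsymbol w \right\rVert_2^2, \qquad \nabla_{\boldsymbol w} E = 2 X^\top (X \boldsymbol w - \boldsymbol y) + 2 \lambda \boldsymbol w $$

The cell below uses this gradient with the constant factor 2 absorbed into the learning rate and $\lambda = 0.1$.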
In [ ]:
class SGD():
    """Simple full-batch gradient descent driven by a user-supplied gradient function."""
    def __init__(self, d, lr=0.01, epochs=1000):
        self.d = d            # gradient function d(X, y, w)
        self.lr = lr          # learning rate
        self.epochs = epochs  # number of update steps

    def __call__(self, X, y):
        # Random initialization followed by repeated gradient steps
        w = np.random.normal(0, 1, size=(X.shape[1], 1))
        for _ in range(self.epochs):
            w -= self.lr * self.d(X, y, w)
        return w

# Gradient of the L2-regularized squared error (constant factors folded into lr)
optim_l2 = SGD(lambda X, y, w: X.T @ (X @ w - y) + 0.1 * w)
w = optim_l2(X, y)
print(w.T, mse_loss(X, y, w))
Logistic regression applies the linear model to binary classification. It uses the logistic (sigmoid) function to map the continuous regression output into $(0, 1)$:
$$ f(x) = \frac{1}{1 + e^{-x}} $$
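Thresholding the sigmoid output at $0.5$ gives the decision rule used below: predict the positive class exactly when the linear score is positive,

$$ \hat y = \begin{cases} 1, & \boldsymbol w^\top \boldsymbol x > 0 \\ 0, & \text{otherwise} \end{cases} $$

This is the rule implemented by the precision helper in the next cell (which, despite its name, reports classification accuracy).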
In [ ]:
# Features and labels for classification (note that no bias column is appended here)
X = df[X_ccol].values
y = df[y_ccol].values.reshape(-1, 1)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def precision(X, y, w):
    # Fraction of samples classified correctly, i.e. the accuracy
    return np.mean((sigmoid(X @ w) > 0.5) == y)
Maximum likelihood estimation yields the optimization objective of logistic regression:
$$ \ell(\boldsymbol \beta) = \sum_{i=1}^m \left( -y_i \boldsymbol \beta^\top \boldsymbol x_i + \ln \left( 1 + e^{\boldsymbol \beta^\top \boldsymbol x_i} \right) \right) $$
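Its gradient has a compact matrix form,

$$ \nabla_{\boldsymbol \beta}\, \ell = \sum_{i=1}^m \boldsymbol x_i \left( \sigma(\boldsymbol \beta^\top \boldsymbol x_i) - y_i \right) = X^\top \left( \sigma(X \boldsymbol \beta) - \boldsymbol y \right) $$

where $\sigma$ is the sigmoid defined above; this is exactly the update direction passed to the optimizer in the next cell.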
In [ ]:
# Gradient of the negative log-likelihood: X^T (sigmoid(X w) - y)
optim_logit = SGD(lambda X, y, w: X.T @ (sigmoid(X @ w) - y))
w = optim_logit(X, y)
print(w.T, precision(X, y, w))
LDA
LDA is short for Linear Discriminant Analysis, a classification algorithm. Its core idea is to project the sample points onto a line (a direction in the feature space), choosing that direction so that projections of samples from the same class lie as close together as possible while projections of samples from different classes lie as far apart as possible.
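For two classes with means $\boldsymbol \mu_0, \boldsymbol \mu_1$ and within-class scatter matrix $S_w$, this amounts to maximizing the generalized Rayleigh quotient

$$ J(\boldsymbol w) = \frac{\boldsymbol w^\top (\boldsymbol \mu_0 - \boldsymbol \mu_1)(\boldsymbol \mu_0 - \boldsymbol \mu_1)^\top \boldsymbol w}{\boldsymbol w^\top S_w \boldsymbol w} $$

whose maximizer (up to scale) is $\boldsymbol w = S_w^{-1} (\boldsymbol \mu_0 - \boldsymbol \mu_1)$. The cell below computes exactly this closed-form direction and classifies by comparing each projection with the midpoint of the projected class means.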
In [ ]:
def cov(X, a, b):
    # Covariance between columns a and b of X
    return np.mean((X[:, a] - np.mean(X[:, a])) * (X[:, b] - np.mean(X[:, b])))

def cov_matrix(X):
    # Covariance matrix assembled entry by entry
    return np.array([
        [cov(X, i, j) for i in range(X.shape[1])]
        for j in range(X.shape[1])
    ])

# Within-class scatter (up to a constant factor) and the two class means
sw = cov_matrix(X[y[:, 0] == 0]) + cov_matrix(X[y[:, 0] == 1])
mu0, mu1 = np.mean(X[y[:, 0] == 0], axis=0), np.mean(X[y[:, 0] == 1], axis=0)

# Closed-form LDA direction: w = Sw^{-1} (mu0 - mu1)
w = np.linalg.inv(sw) @ (mu0 - mu1).reshape(-1, 1)

# Projected class means; classify by thresholding at their midpoint
c0, c1 = w.T @ mu0, w.T @ mu1

def precision(X, y, w):
    # Accuracy of thresholding the projection X @ w at the midpoint (c0 + c1) / 2
    return np.mean((X @ w < (c0 + c1) / 2) == y)

precision(X, y, w)
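As an optional sanity check (not part of the original notebook, and assuming scikit-learn is installed), the same data can be fed to scikit-learn's LinearDiscriminantAnalysis, whose score method reports training accuracy:

In [ ]:
# Hypothetical cross-check with scikit-learn; X, y as prepared for classification above
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis()
lda.fit(X, y.ravel())
print(lda.score(X, y.ravel()))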