In [1]:
Copied!
import numpy as np
import pandas as pd
df = pd.read_csv('data/melon3.0a.csv')
# Column splits: regression targets the second-to-last column, classification
# targets the last; the first column is skipped (presumably a sample id —
# verify against the CSV).
X_rcol, y_rcol = df.columns[1:-2], df.columns[-2]
X_ccol, y_ccol = df.columns[1:-1], df.columns[-1]
# Ordinal encoding for the categorical attributes and the binary label.
value_map = {
    '色泽': {'浅白': 0, '青绿': 1, '乌黑': 2},
    '根蒂': {'蜷缩': 0, '稍蜷': 1, '硬挺': 2},
    '敲声': {'沉闷': 0, '浊响': 1, '清脆': 2},
    '纹理': {'模糊': 0, '稍糊': 1, '清晰': 2},
    '脐部': {'凹陷': 0, '稍凹': 1, '平坦': 2},
    '触感': {'硬滑': 0, '软粘': 1},
    '好瓜': {'是': 1, '否': 0},
}
for col, mapping in value_map.items():
    df[col] = df[col].map(mapping)
# Regression design matrix with an appended bias column of ones.
X = np.hstack([df[X_rcol].to_numpy(), np.ones((len(df), 1))])
y = df[y_rcol].to_numpy().reshape(-1, 1)
def linear_regression(X, y):
    """Ordinary least squares via the normal equations.

    Solves (X^T X) w = X^T y with np.linalg.solve instead of forming the
    explicit inverse — faster and numerically more stable. Raises
    np.linalg.LinAlgError when X^T X is singular (rank-deficient design).

    Parameters
    ----------
    X : (m, d) design matrix (bias column appended by the caller).
    y : (m, 1) target column vector.

    Returns
    -------
    (d, 1) weight vector (last entry is the bias when X has a ones column).
    """
    return np.linalg.solve(X.T @ X, X.T @ y)
# Display the fitted weights as a row vector (last entry is the bias term).
linear_regression(X, y).T
import numpy as np
import pandas as pd
df = pd.read_csv('data/melon3.0a.csv')
# Column splits: regression targets the second-to-last column, classification
# targets the last; the first column is skipped (presumably a sample id —
# verify against the CSV).
X_rcol, y_rcol = df.columns[1:-2], df.columns[-2]
X_ccol, y_ccol = df.columns[1:-1], df.columns[-1]
# Ordinal encoding for the categorical attributes and the binary label.
value_map = {
    '色泽': {'浅白': 0, '青绿': 1, '乌黑': 2},
    '根蒂': {'蜷缩': 0, '稍蜷': 1, '硬挺': 2},
    '敲声': {'沉闷': 0, '浊响': 1, '清脆': 2},
    '纹理': {'模糊': 0, '稍糊': 1, '清晰': 2},
    '脐部': {'凹陷': 0, '稍凹': 1, '平坦': 2},
    '触感': {'硬滑': 0, '软粘': 1},
    '好瓜': {'是': 1, '否': 0},
}
for col, mapping in value_map.items():
    df[col] = df[col].map(mapping)
# Regression design matrix with an appended bias column of ones.
X = np.hstack([df[X_rcol].to_numpy(), np.ones((len(df), 1))])
y = df[y_rcol].to_numpy().reshape(-1, 1)
def linear_regression(X, y):
    """Ordinary least squares via the normal equations.

    Solves (X^T X) w = X^T y with np.linalg.solve instead of forming the
    explicit inverse — faster and numerically more stable. Raises
    np.linalg.LinAlgError when X^T X is singular (rank-deficient design).

    Parameters
    ----------
    X : (m, d) design matrix (bias column appended by the caller).
    y : (m, 1) target column vector.

    Returns
    -------
    (d, 1) weight vector (last entry is the bias when X has a ones column).
    """
    return np.linalg.solve(X.T @ X, X.T @ y)
# Display the fitted weights as a row vector (last entry is the bias term).
linear_regression(X, y).T
/tmp/ipykernel_107409/2537742158.py:2: DeprecationWarning: Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0), (to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries) but was not found to be installed on your system. If this would cause problems for you, please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466 import pandas as pd
Out[1]:
array([[-0.00561581, -0.02008279, 0.03507281, 0.10539452, -0.03439565, 0.05235698, 0.09502766, 0.02259885]])
损失函数为均方误差
In [2]:
Copied!
def mse_loss(X, y, w):
    """Mean squared error of the linear model w on the data set (X, y)."""
    residual = X @ w - y
    return np.mean(residual ** 2)
# Training MSE of the closed-form least-squares fit.
mse_loss(X, y, linear_regression(X, y))
def mse_loss(X, y, w):
    """Mean squared error of the linear model w on the data set (X, y)."""
    residual = X @ w - y
    return np.mean(residual ** 2)
# Training MSE of the closed-form least-squares fit.
mse_loss(X, y, linear_regression(X, y))
Out[2]:
0.003631662892364951
当样本矩阵非满秩时,存在多个满足训练集的模型,此时可以在优化目标中加入正则化项,如L2-norm即加入权重的平方之和。然后使用梯度下降等数值方式进行计算。
In [3]:
Copied!
class SGD():
    """Gradient descent driven by a user-supplied gradient function.

    NOTE: despite the name, every step uses the full data set, so this is
    batch gradient descent rather than stochastic gradient descent.
    """

    def __init__(self, d, lr=0.01, epochs=1000, seed=None):
        """
        Parameters
        ----------
        d : callable (X, y, w) -> gradient array with the same shape as w.
        lr : step size.
        epochs : number of full-batch update steps.
        seed : optional int for a reproducible random initialization
            (default None keeps the original unseeded behaviour).
        """
        self.d = d
        self.lr = lr
        self.epochs = epochs
        self.seed = seed

    def __call__(self, X, y):
        """Run the descent on (X, y) and return the (n_features, 1) weights."""
        rng = np.random.default_rng(self.seed)
        # Standard-normal initialization, one weight per column of X.
        w = rng.normal(0, 1, size=(X.shape[1], 1))
        for _ in range(self.epochs):
            w -= self.lr * self.d(X, y, w)
        return w
# L2-regularized gradient: data term X^T(Xw - y) plus 0.1 * w weight decay.
optim_l2 = SGD(lambda X, y, w: X.T @ (X @ w - y) + 0.1 * w)
w = optim_l2(X, y)
# NOTE(review): no seed is set, so these numbers change on every run.
print(w.T, mse_loss(X, y, w))
class SGD():
    """Gradient descent driven by a user-supplied gradient function.

    NOTE: despite the name, every step uses the full data set, so this is
    batch gradient descent rather than stochastic gradient descent.
    """

    def __init__(self, d, lr=0.01, epochs=1000, seed=None):
        """
        Parameters
        ----------
        d : callable (X, y, w) -> gradient array with the same shape as w.
        lr : step size.
        epochs : number of full-batch update steps.
        seed : optional int for a reproducible random initialization
            (default None keeps the original unseeded behaviour).
        """
        self.d = d
        self.lr = lr
        self.epochs = epochs
        self.seed = seed

    def __call__(self, X, y):
        """Run the descent on (X, y) and return the (n_features, 1) weights."""
        rng = np.random.default_rng(self.seed)
        # Standard-normal initialization, one weight per column of X.
        w = rng.normal(0, 1, size=(X.shape[1], 1))
        for _ in range(self.epochs):
            w -= self.lr * self.d(X, y, w)
        return w
# L2-regularized gradient: data term X^T(Xw - y) plus 0.1 * w weight decay.
optim_l2 = SGD(lambda X, y, w: X.T @ (X @ w - y) + 0.1 * w)
w = optim_l2(X, y)
# NOTE(review): no seed is set, so these numbers change on every run.
print(w.T, mse_loss(X, y, w))
[[ 0.0052212 -0.03642245 0.01139193 0.09204791 -0.05482619 0.02187988 -0.18066372 0.2319513 ]] 0.0038789880547393696
对率回归可以将线性模型应用到二分类问题上。对率回归需要用到 logistic（sigmoid）函数将连续的回归值映射到 $(0, 1)$ 上（logit 函数是它的反函数）
$$ f(x) = \frac{1}{1 + e^{-x}} $$
In [4]:
Copied!
# Switch to the classification columns: all features as X, the encoded
# 好瓜 label (1 = good, 0 = bad) as y.
X = df[X_ccol].values
y = df[y_ccol].values.reshape(-1, 1)
def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + e^{-x}).

    Evaluates e^{-|x|} so np.exp never receives a large positive argument,
    avoiding the overflow warning the naive 1/(1+exp(-x)) form emits for
    large-magnitude negative inputs. Results are identical in value.
    """
    z = np.exp(-np.abs(x))
    return np.where(np.asarray(x) >= 0, 1.0 / (1.0 + z), z / (1.0 + z))

def precision(X, y, w):
    """Fraction of samples where thresholding sigmoid(X @ w) at 0.5 matches y.

    NOTE(review): despite the name this computes classification *accuracy*,
    not precision; the name is kept because later cells call it.
    """
    return np.mean((sigmoid(X @ w) > 0.5) == y)
# Switch to the classification columns: all features as X, the encoded
# 好瓜 label (1 = good, 0 = bad) as y.
X = df[X_ccol].values
y = df[y_ccol].values.reshape(-1, 1)
def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + e^{-x}).

    Evaluates e^{-|x|} so np.exp never receives a large positive argument,
    avoiding the overflow warning the naive 1/(1+exp(-x)) form emits for
    large-magnitude negative inputs. Results are identical in value.
    """
    z = np.exp(-np.abs(x))
    return np.where(np.asarray(x) >= 0, 1.0 / (1.0 + z), z / (1.0 + z))

def precision(X, y, w):
    """Fraction of samples where thresholding sigmoid(X @ w) at 0.5 matches y.

    NOTE(review): despite the name this computes classification *accuracy*,
    not precision; the name is kept because later cells call it.
    """
    return np.mean((sigmoid(X @ w) > 0.5) == y)
使用极大似然法可以得到对率回归的优化目标函数
$$ l(\boldsymbol w) = \sum_{i=1}^m \left(-\boldsymbol y_i\boldsymbol \beta^\top \boldsymbol x_i + \ln \left(1 + e^{\boldsymbol \beta^\top \boldsymbol x_i}\right) \right) $$
In [5]:
Copied!
# Logistic-regression gradient X^T (sigmoid(Xw) - y), reusing the optimizer.
optim_logit = SGD(lambda X, y, w: X.T @ (sigmoid(X @ w) - y))
w = optim_logit(X, y)
# NOTE(review): unseeded random init — the accuracy may vary between runs.
print(w.T, precision(X, y, w))
# Logistic-regression gradient X^T (sigmoid(Xw) - y), reusing the optimizer.
optim_logit = SGD(lambda X, y, w: X.T @ (sigmoid(X @ w) - y))
w = optim_logit(X, y)
# NOTE(review): unseeded random init — the accuracy may vary between runs.
print(w.T, precision(X, y, w))
[[ 1.07920029 -2.75255144 1.16538144 1.86133974 -2.04621827 0.9005823 -2.53920491 -2.2742112 ]] 0.8823529411764706
LDA¶
LDA是线性判别分析的简称,属于分类算法。该算法的核心思路为将样本点投影到一条直线（更一般地,一个低维子空间）上,通过选择投影方向,最小化同一类别样本点投影后的离散程度,同时最大化不同类别投影中心之间的距离。
In [6]:
Copied!
def cov(X, a, b):
    """Population covariance (divisor N) between columns a and b of X."""
    dev_a = X[:, a] - X[:, a].mean()
    dev_b = X[:, b] - X[:, b].mean()
    return np.mean(dev_a * dev_b)
def cov_matrix(X):
    """Population covariance matrix (d x d) of the columns of X.

    Vectorized replacement for the original double Python loop (which
    recomputed each column mean d times): center the columns once, then a
    single matrix product yields every pairwise covariance with divisor N
    (matching np.cov(X, rowvar=False, bias=True)).
    """
    centered = X - X.mean(axis=0)
    return (centered.T @ centered) / X.shape[0]
# Sum of the per-class covariance matrices (covariance uses divisor N, so this
# is the within-class scatter up to per-class scaling; the resulting direction
# is unaffected).
sw = cov_matrix(X[y[:, 0] == 0]) + cov_matrix(X[y[:, 0] == 1])
# Per-class feature means (class 0 = bad melon, class 1 = good melon).
mu0, mu1 = np.mean(X[y[:, 0] == 0], axis=0), np.mean(X[y[:, 0] == 1], axis=0)
# LDA direction: w proportional to Sw^{-1} (mu0 - mu1).
w = np.linalg.inv(sw) @ (mu0 - mu1).reshape(-1, 1)
# Projections of the two class means; their midpoint is the decision threshold.
c0, c1 = w.T @ mu0, w.T @ mu1
def precision(X, y, w):
    """LDA accuracy: predicts class 1 when the projection X @ w falls below
    the midpoint of the projected class centres, then compares with y.

    NOTE(review): reads the globals c0 and c1 from the statements above and
    shadows the earlier logistic-regression `precision`; like that one, it
    computes accuracy rather than precision.
    """
    return np.mean((X @ w < (c0 + c1) / 2) == y)

# Training-set accuracy of the LDA classifier.
precision(X, y, w)
def cov(X, a, b):
    """Population covariance (divisor N) between columns a and b of X."""
    dev_a = X[:, a] - X[:, a].mean()
    dev_b = X[:, b] - X[:, b].mean()
    return np.mean(dev_a * dev_b)
def cov_matrix(X):
    """Population covariance matrix (d x d) of the columns of X.

    Vectorized replacement for the original double Python loop (which
    recomputed each column mean d times): center the columns once, then a
    single matrix product yields every pairwise covariance with divisor N
    (matching np.cov(X, rowvar=False, bias=True)).
    """
    centered = X - X.mean(axis=0)
    return (centered.T @ centered) / X.shape[0]
# Sum of the per-class covariance matrices (covariance uses divisor N, so this
# is the within-class scatter up to per-class scaling; the resulting direction
# is unaffected).
sw = cov_matrix(X[y[:, 0] == 0]) + cov_matrix(X[y[:, 0] == 1])
# Per-class feature means (class 0 = bad melon, class 1 = good melon).
mu0, mu1 = np.mean(X[y[:, 0] == 0], axis=0), np.mean(X[y[:, 0] == 1], axis=0)
# LDA direction: w proportional to Sw^{-1} (mu0 - mu1).
w = np.linalg.inv(sw) @ (mu0 - mu1).reshape(-1, 1)
# Projections of the two class means; their midpoint is the decision threshold.
c0, c1 = w.T @ mu0, w.T @ mu1
def precision(X, y, w):
    """LDA accuracy: predicts class 1 when the projection X @ w falls below
    the midpoint of the projected class centres, then compares with y.

    NOTE(review): reads the globals c0 and c1 from the statements above and
    shadows the earlier logistic-regression `precision`; like that one, it
    computes accuracy rather than precision.
    """
    return np.mean((X @ w < (c0 + c1) / 2) == y)

# Training-set accuracy of the LDA classifier.
precision(X, y, w)
Out[6]:
0.8823529411764706