40 · 线性与逻辑回归:从预测到二分类
🔗 知识图谱导航:阅读本文前,建议先回顾《39 · scikit-learn 实战:预处理、特征选择与交叉验证》中的
StandardScaler、train_test_split和Pipeline思路;本文会把这些工具放进“预测连续值”和“判断二分类”的真实脚本里。
运行环境:
pip install numpy scikit-learn。本文不下载外部数据,所有训练数据都由make_regression()和make_classification()在本地生成。
痛点与架构:回归不是背公式,分类也不是只调
fit()。这篇文章要解决两个最常见问题:一是“给我几个特征,预测一个连续数字”;二是“给我一组样本,判断它属于 0 还是 1”。我们会先手写线性回归理解梯度下降,再用 sklearn 做逻辑回归和决策边界。
回归与分类先分清
线性回归:输入特征 -> 输出连续值
例子:面积、楼层、地铁距离 -> 房价
逻辑回归:输入特征 -> 输出概率 -> 转成 0/1 类别
例子:访问时长、点击次数、历史行为 -> 是否转化
极客解析:线性回归像“用尺子量价格”,逻辑回归像“给风险打分再盖章”。前者关心误差有多大,后者关心判断对不对、漏判多不多。
步步为营:核心逻辑自适应拆解
这一篇按 6 个台阶走:先学会打印指标,再手写梯度下降,接着跑完整线性回归、逻辑回归、ASCII 决策边界,最后用 argparse 做终端遥控器。每段演示都能独立运行,并且会立刻打印结果。
Step 1:用 print_table 把模型指标排成新手能读懂的表格
痛点与机制:
print_table 是这篇文章的“成绩单排版器”。机器学习会产出 MAE、MSE、准确率、F1 等一堆指标,如果散着打印,新手很容易迷路。它像 Excel 表格,把每个指标塞进固定格子里,让你一眼看出模型表现。
核心源码(逐字来自文末完整源码):
def print_table(headers: list, rows: list, title: str = "") -> None:
    """Print ``rows`` under ``headers`` as a box-drawing table on stdout.

    Column widths are measured in terminal cells rather than code points, so
    East Asian wide/fullwidth characters (two cells each) keep the borders
    aligned.

    Args:
        headers: Column titles; each one is rendered with ``str()``.
        rows: One sequence of cell values per row. Rows shorter than
            ``headers`` are padded with empty cells; extra cells are ignored.
        title: Optional banner printed above the table between ``=`` rules.
    """
    import unicodedata  # local import keeps this snippet self-contained

    def cell_width(text: str) -> int:
        # "W" (wide) and "F" (fullwidth) characters occupy two terminal cells.
        return sum(2 if unicodedata.east_asian_width(ch) in ("W", "F") else 1
                   for ch in text)

    def pad(text: str, width: int) -> str:
        return text + " " * (width - cell_width(text))

    if title:
        print(f"\n{'='*60}\n {title}\n{'='*60}")

    head_cells = [str(h) for h in headers]
    n_cols = len(head_cells)
    body = []
    for row in rows:
        cells = [str(v) for v in row][:n_cols]
        body.append(cells + [""] * (n_cols - len(cells)))
    col_widths = [
        max([cell_width(head)] + [cell_width(cells[i]) for cells in body])
        for i, head in enumerate(head_cells)
    ]

    def rule(left: str, joint: str, right: str) -> str:
        return left + joint.join("─" * (w + 2) for w in col_widths) + right

    def render(cells: list) -> str:
        return "│" + "│".join(f" {pad(c, w)} " for c, w in zip(cells, col_widths)) + "│"

    print(rule("┌", "┬", "┐"))
    print(render(head_cells))
    print(rule("├", "┼", "┤"))
    for cells in body:
        print(render(cells))
    print(rule("└", "┴", "┘"))
可运行演示(补齐 Mock 数据与 print 反馈):
def print_table(headers: list, rows: list, title: str = "") -> None:
    """Print ``rows`` under ``headers`` as a box-drawing table on stdout.

    Column widths are measured in terminal cells rather than code points, so
    East Asian wide/fullwidth characters (two cells each) keep the borders
    aligned.

    Args:
        headers: Column titles; each one is rendered with ``str()``.
        rows: One sequence of cell values per row. Rows shorter than
            ``headers`` are padded with empty cells; extra cells are ignored.
        title: Optional banner printed above the table between ``=`` rules.
    """
    import unicodedata  # local import keeps this snippet self-contained

    def cell_width(text: str) -> int:
        # "W" (wide) and "F" (fullwidth) characters occupy two terminal cells.
        return sum(2 if unicodedata.east_asian_width(ch) in ("W", "F") else 1
                   for ch in text)

    def pad(text: str, width: int) -> str:
        return text + " " * (width - cell_width(text))

    if title:
        print(f"\n{'='*60}\n {title}\n{'='*60}")

    head_cells = [str(h) for h in headers]
    n_cols = len(head_cells)
    body = []
    for row in rows:
        cells = [str(v) for v in row][:n_cols]
        body.append(cells + [""] * (n_cols - len(cells)))
    col_widths = [
        max([cell_width(head)] + [cell_width(cells[i]) for cells in body])
        for i, head in enumerate(head_cells)
    ]

    def rule(left: str, joint: str, right: str) -> str:
        return left + joint.join("─" * (w + 2) for w in col_widths) + right

    def render(cells: list) -> str:
        return "│" + "│".join(f" {pad(c, w)} " for c, w in zip(cells, col_widths)) + "│"

    print(rule("┌", "┬", "┐"))
    print(render(head_cells))
    print(rule("├", "┼", "┤"))
    for cells in body:
        print(render(cells))
    print(rule("└", "┴", "┘"))
# Demo: render a two-row overview of the two models covered in this article.
overview_headers = ["模型", "用途", "一句话"]
overview_rows = [
    ["线性回归", "预测连续值", "估房价、销量、温度"],
    ["逻辑回归", "预测二分类", "判断是/否、通过/不通过"],
]
print_table(overview_headers, overview_rows, title="回归任务入门地图")
Step 2:用 LinearRegressionGD 手写梯度下降学习权重
痛点与机制:
线性回归可以理解成“找一把最合适的尺子”:每个特征有一个权重 w,整体再加一个偏置 b。梯度下降像下山,每轮都根据误差往更低的损失方向挪一点;学习率 lr 就是步子大小,太大会摔跤,太小会走很久。
核心源码(逐字来自文末完整源码):
class LinearRegressionGD:
    """Linear regression trained with full-batch gradient descent on MSE.

    The model is ``y_hat = X @ w + b``. Every epoch uses all samples to move
    ``w`` and ``b`` one step of size ``lr`` against the gradient of the mean
    squared error.
    """

    def __init__(self, lr: float = 0.01, epochs: int = 1000):
        self.lr = lr                       # gradient step size
        self.epochs = epochs               # number of full-batch updates
        self.w: np.ndarray = np.array([])  # feature weights, filled in by fit()
        self.b: float = 0.0                # intercept, filled in by fit()
        self.losses: list = []             # MSE measured before each update

    def fit(self, X: np.ndarray, y: np.ndarray) -> "LinearRegressionGD":
        """Learn ``w`` and ``b`` from ``X`` (n_samples, n_features) and ``y``.

        Each call starts again from zero weights and an empty loss history, so
        refitting the same instance does not mix results from earlier runs.
        ``y`` may be given as ``(n_samples,)`` or ``(n_samples, 1)``.

        Returns:
            ``self``, so calls can be chained.
        """
        X = np.asarray(X, dtype=float)
        # Flatten the target so a column vector cannot broadcast against y_hat.
        y = np.asarray(y, dtype=float).ravel()
        n, d = X.shape
        self.w = np.zeros(d)
        self.b = 0.0
        self.losses = []  # drop the history of any previous fit()
        step = self.lr * (2 / n)  # lr times the 2/n factor of the MSE gradient
        for _ in range(self.epochs):
            y_hat = X @ self.w + self.b
            err = y_hat - y
            self.w -= step * (X.T @ err)
            self.b -= step * err.sum()
            # Loss of the parameters used for this epoch's prediction.
            self.losses.append(float(np.mean(err ** 2)))
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return ``X @ w + b`` for the rows of ``X``."""
        return X @ self.w + self.b
可运行演示(补齐 Mock 数据与 print 反馈):
import numpy as np
class LinearRegressionGD:
    """Linear regression trained with full-batch gradient descent on MSE.

    The model is ``y_hat = X @ w + b``. Every epoch uses all samples to move
    ``w`` and ``b`` one step of size ``lr`` against the gradient of the mean
    squared error.
    """

    def __init__(self, lr: float = 0.01, epochs: int = 1000):
        self.lr = lr                       # gradient step size
        self.epochs = epochs               # number of full-batch updates
        self.w: np.ndarray = np.array([])  # feature weights, filled in by fit()
        self.b: float = 0.0                # intercept, filled in by fit()
        self.losses: list = []             # MSE measured before each update

    def fit(self, X: np.ndarray, y: np.ndarray) -> "LinearRegressionGD":
        """Learn ``w`` and ``b`` from ``X`` (n_samples, n_features) and ``y``.

        Each call starts again from zero weights and an empty loss history, so
        refitting the same instance does not mix results from earlier runs.
        ``y`` may be given as ``(n_samples,)`` or ``(n_samples, 1)``.

        Returns:
            ``self``, so calls can be chained.
        """
        X = np.asarray(X, dtype=float)
        # Flatten the target so a column vector cannot broadcast against y_hat.
        y = np.asarray(y, dtype=float).ravel()
        n, d = X.shape
        self.w = np.zeros(d)
        self.b = 0.0
        self.losses = []  # drop the history of any previous fit()
        step = self.lr * (2 / n)  # lr times the 2/n factor of the MSE gradient
        for _ in range(self.epochs):
            y_hat = X @ self.w + self.b
            err = y_hat - y
            self.w -= step * (X.T @ err)
            self.b -= step * err.sum()
            # Loss of the parameters used for this epoch's prediction.
            self.losses.append(float(np.mean(err ** 2)))
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return ``X @ w + b`` for the rows of ``X``."""
        return X @ self.w + self.b
# Mock data: assume price = 3 * area + 2 * closeness to the subway + 5.
X = np.array([[1.0, 1.0], [2.0, 1.0], [3.0, 2.0], [4.0, 3.0]])
y = np.array([10.0, 13.0, 18.0, 23.0])
model = LinearRegressionGD(lr=0.03, epochs=600)
model.fit(X, y)

# Pull the reported numbers out first so each print line stays short.
query = np.array([[5.0, 3.0]])
predicted_price = float(model.predict(query)[0])
first_loss, last_loss = model.losses[0], model.losses[-1]

print("学到的权重 w:", np.round(model.w, 3).tolist())
print("学到的偏置 b:", round(model.b, 3))
print("预测 [5, 3] 的价格:", round(predicted_price, 2))
print("损失从", round(first_loss, 2), "降到", round(last_loss, 4))
Step 3:用 mode_linear 跑完整连续值预测闭环
痛点与机制:
mode_linear 把造数据、切训练集、标准化、训练、预测和评估串起来。它像一次完整考试:训练集是练习题,测试集是正式卷,MAE/MSE/R² 是阅卷结果。损失曲线如果一路下降,说明模型真的在学习。
核心源码(逐字来自文末完整源码):
def mode_linear() -> None:
    """Train ``LinearRegressionGD`` on synthetic data and print its metrics.

    Builds a 500-sample, 5-feature regression problem, holds out 20% for
    testing, standardizes the features, trains for 500 epochs, then prints an
    ASCII loss chart and a table of MAE / MSE / RMSE / R².
    """
    print(f"[{nexdo_time()}] 梯度下降线性回归")
    features, targets = make_regression(n_samples=500, n_features=5, noise=10, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.2, random_state=42
    )

    # Scaling statistics come from the training split and are reused on the test split.
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)

    model = LinearRegressionGD(lr=0.05, epochs=500)
    model.fit(X_train_s, y_train)
    y_pred = model.predict(X_test_s)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # ASCII loss chart: one column per sampled epoch (0, 100, 200, ...),
    # eight rows whose thresholds are fractions of the largest sampled loss.
    print("\n 训练损失曲线(每100轮)")
    sampled = model.losses[::100]
    peak = max(sampled)
    height = 8
    for level in range(height, 0, -1):
        threshold = level * peak / height
        bars = "".join("█" if loss >= threshold else " " for loss in sampled)
        print(f" {threshold:8.1f} │{bars}")
    print(f" └{'─'*len(sampled)}")
    print(" 轮次(×100)")

    metric_rows = [
        ["MAE", f"{mae:.4f}"],
        ["MSE", f"{mse:.4f}"],
        ["RMSE", f"{math.sqrt(mse):.4f}"],
        ["R²", f"{r2:.4f}"],
    ]
    print_table(["指标", "值"], metric_rows, "线性回归评估指标")
可运行演示(补齐 Mock 数据与 print 反馈):
import argparse
import time
import math
from typing import Tuple
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
def nexdo_time() -> str:
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    stamp_format = "%Y-%m-%d %H:%M:%S"
    return time.strftime(stamp_format)
def print_table(headers: list, rows: list, title: str = "") -> None:
    """Print ``rows`` under ``headers`` as a box-drawing table on stdout.

    Column widths are measured in terminal cells rather than code points, so
    East Asian wide/fullwidth characters (two cells each) keep the borders
    aligned.

    Args:
        headers: Column titles; each one is rendered with ``str()``.
        rows: One sequence of cell values per row. Rows shorter than
            ``headers`` are padded with empty cells; extra cells are ignored.
        title: Optional banner printed above the table between ``=`` rules.
    """
    import unicodedata  # local import keeps this snippet self-contained

    def cell_width(text: str) -> int:
        # "W" (wide) and "F" (fullwidth) characters occupy two terminal cells.
        return sum(2 if unicodedata.east_asian_width(ch) in ("W", "F") else 1
                   for ch in text)

    def pad(text: str, width: int) -> str:
        return text + " " * (width - cell_width(text))

    if title:
        print(f"\n{'='*60}\n {title}\n{'='*60}")

    head_cells = [str(h) for h in headers]
    n_cols = len(head_cells)
    body = []
    for row in rows:
        cells = [str(v) for v in row][:n_cols]
        body.append(cells + [""] * (n_cols - len(cells)))
    col_widths = [
        max([cell_width(head)] + [cell_width(cells[i]) for cells in body])
        for i, head in enumerate(head_cells)
    ]

    def rule(left: str, joint: str, right: str) -> str:
        return left + joint.join("─" * (w + 2) for w in col_widths) + right

    def render(cells: list) -> str:
        return "│" + "│".join(f" {pad(c, w)} " for c, w in zip(cells, col_widths)) + "│"

    print(rule("┌", "┬", "┐"))
    print(render(head_cells))
    print(rule("├", "┼", "┤"))
    for cells in body:
        print(render(cells))
    print(rule("└", "┴", "┘"))
class LinearRegressionGD:
    """Linear regression trained with full-batch gradient descent on MSE.

    The model is ``y_hat = X @ w + b``. Every epoch uses all samples to move
    ``w`` and ``b`` one step of size ``lr`` against the gradient of the mean
    squared error.
    """

    def __init__(self, lr: float = 0.01, epochs: int = 1000):
        self.lr = lr                       # gradient step size
        self.epochs = epochs               # number of full-batch updates
        self.w: np.ndarray = np.array([])  # feature weights, filled in by fit()
        self.b: float = 0.0                # intercept, filled in by fit()
        self.losses: list = []             # MSE measured before each update

    def fit(self, X: np.ndarray, y: np.ndarray) -> "LinearRegressionGD":
        """Learn ``w`` and ``b`` from ``X`` (n_samples, n_features) and ``y``.

        Each call starts again from zero weights and an empty loss history, so
        refitting the same instance does not mix results from earlier runs.
        ``y`` may be given as ``(n_samples,)`` or ``(n_samples, 1)``.

        Returns:
            ``self``, so calls can be chained.
        """
        X = np.asarray(X, dtype=float)
        # Flatten the target so a column vector cannot broadcast against y_hat.
        y = np.asarray(y, dtype=float).ravel()
        n, d = X.shape
        self.w = np.zeros(d)
        self.b = 0.0
        self.losses = []  # drop the history of any previous fit()
        step = self.lr * (2 / n)  # lr times the 2/n factor of the MSE gradient
        for _ in range(self.epochs):
            y_hat = X @ self.w + self.b
            err = y_hat - y
            self.w -= step * (X.T @ err)
            self.b -= step * err.sum()
            # Loss of the parameters used for this epoch's prediction.
            self.losses.append(float(np.mean(err ** 2)))
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return ``X @ w + b`` for the rows of ``X``."""
        return X @ self.w + self.b
def mode_linear() -> None:
    """Train ``LinearRegressionGD`` on synthetic data and print its metrics.

    Builds a 500-sample, 5-feature regression problem, holds out 20% for
    testing, standardizes the features, trains for 500 epochs, then prints an
    ASCII loss chart and a table of MAE / MSE / RMSE / R².
    """
    print(f"[{nexdo_time()}] 梯度下降线性回归")
    features, targets = make_regression(n_samples=500, n_features=5, noise=10, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.2, random_state=42
    )

    # Scaling statistics come from the training split and are reused on the test split.
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)

    model = LinearRegressionGD(lr=0.05, epochs=500)
    model.fit(X_train_s, y_train)
    y_pred = model.predict(X_test_s)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # ASCII loss chart: one column per sampled epoch (0, 100, 200, ...),
    # eight rows whose thresholds are fractions of the largest sampled loss.
    print("\n 训练损失曲线(每100轮)")
    sampled = model.losses[::100]
    peak = max(sampled)
    height = 8
    for level in range(height, 0, -1):
        threshold = level * peak / height
        bars = "".join("█" if loss >= threshold else " " for loss in sampled)
        print(f" {threshold:8.1f} │{bars}")
    print(f" └{'─'*len(sampled)}")
    print(" 轮次(×100)")

    metric_rows = [
        ["MAE", f"{mae:.4f}"],
        ["MSE", f"{mse:.4f}"],
        ["RMSE", f"{math.sqrt(mse):.4f}"],
        ["R²", f"{r2:.4f}"],
    ]
    print_table(["指标", "值"], metric_rows, "线性回归评估指标")
# Run the linear-regression mode directly to see the loss curve and MAE/MSE/R2.
mode_linear()
Step 4:用 mode_logistic 对比不同 C 值的二分类效果
痛点与机制:
逻辑回归虽然叫“回归”,实际常用来做二分类。它先算一个分数,再用 sigmoid 压成 0 到 1 的概率。C 是正则化强度的倒数:C 小,管得严,模型更保守;C 大,管得松,模型更愿意贴合训练数据。
核心源码(逐字来自文末完整源码):
def mode_logistic() -> None:
    """Compare logistic-regression classifiers with different ``C`` values.

    Builds a 1000-sample, 10-feature binary problem, holds out 20% for
    testing, standardizes the features, fits each candidate and prints
    accuracy / precision / recall / F1 on the test split as a table.
    """
    print(f"[{nexdo_time()}] sklearn 逻辑回归")
    X, y = make_classification(n_samples=1000, n_features=10, n_informative=6, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)

    # (row label, C) pairs; every model shares max_iter=500 and random_state=42.
    # NOTE(review): "Ridge(LR)" is built with exactly the same arguments as
    # "LR(C=1.0)", so both rows report identical metrics.
    candidates = [
        ("LR(C=0.01)", 0.01),
        ("LR(C=1.0)", 1.0),
        ("LR(C=100)", 100),
        ("Ridge(LR)", 1.0),
    ]
    scorers = (accuracy_score, precision_score, recall_score, f1_score)

    rows = []
    for label, c_value in candidates:
        clf = LogisticRegression(C=c_value, max_iter=500, random_state=42)
        clf.fit(X_train_s, y_train)
        predicted = clf.predict(X_test_s)
        rows.append([label] + [f"{score(y_test, predicted):.4f}" for score in scorers])
    print_table(["模型", "准确率", "精确率", "召回率", "F1"], rows, "逻辑回归正则化对比")
可运行演示(补齐 Mock 数据与 print 反馈):
import time
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def nexdo_time() -> str:
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    stamp_format = "%Y-%m-%d %H:%M:%S"
    return time.strftime(stamp_format)
def print_table(headers: list, rows: list, title: str = "") -> None:
    """Print ``rows`` under ``headers`` as a box-drawing table on stdout.

    Column widths are measured in terminal cells rather than code points, so
    East Asian wide/fullwidth characters (two cells each) keep the borders
    aligned.

    Args:
        headers: Column titles; each one is rendered with ``str()``.
        rows: One sequence of cell values per row. Rows shorter than
            ``headers`` are padded with empty cells; extra cells are ignored.
        title: Optional banner printed above the table between ``=`` rules.
    """
    import unicodedata  # local import keeps this snippet self-contained

    def cell_width(text: str) -> int:
        # "W" (wide) and "F" (fullwidth) characters occupy two terminal cells.
        return sum(2 if unicodedata.east_asian_width(ch) in ("W", "F") else 1
                   for ch in text)

    def pad(text: str, width: int) -> str:
        return text + " " * (width - cell_width(text))

    if title:
        print(f"\n{'='*60}\n {title}\n{'='*60}")

    head_cells = [str(h) for h in headers]
    n_cols = len(head_cells)
    body = []
    for row in rows:
        cells = [str(v) for v in row][:n_cols]
        body.append(cells + [""] * (n_cols - len(cells)))
    col_widths = [
        max([cell_width(head)] + [cell_width(cells[i]) for cells in body])
        for i, head in enumerate(head_cells)
    ]

    def rule(left: str, joint: str, right: str) -> str:
        return left + joint.join("─" * (w + 2) for w in col_widths) + right

    def render(cells: list) -> str:
        return "│" + "│".join(f" {pad(c, w)} " for c, w in zip(cells, col_widths)) + "│"

    print(rule("┌", "┬", "┐"))
    print(render(head_cells))
    print(rule("├", "┼", "┤"))
    for cells in body:
        print(render(cells))
    print(rule("└", "┴", "┘"))
def mode_logistic() -> None:
    """Compare logistic-regression classifiers with different ``C`` values.

    Builds a 1000-sample, 10-feature binary problem, holds out 20% for
    testing, standardizes the features, fits each candidate and prints
    accuracy / precision / recall / F1 on the test split as a table.
    """
    print(f"[{nexdo_time()}] sklearn 逻辑回归")
    X, y = make_classification(n_samples=1000, n_features=10, n_informative=6, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)

    # (row label, C) pairs; every model shares max_iter=500 and random_state=42.
    # NOTE(review): "Ridge(LR)" is built with exactly the same arguments as
    # "LR(C=1.0)", so both rows report identical metrics.
    candidates = [
        ("LR(C=0.01)", 0.01),
        ("LR(C=1.0)", 1.0),
        ("LR(C=100)", 100),
        ("Ridge(LR)", 1.0),
    ]
    scorers = (accuracy_score, precision_score, recall_score, f1_score)

    rows = []
    for label, c_value in candidates:
        clf = LogisticRegression(C=c_value, max_iter=500, random_state=42)
        clf.fit(X_train_s, y_train)
        predicted = clf.predict(X_test_s)
        rows.append([label] + [f"{score(y_test, predicted):.4f}" for score in scorers])
    print_table(["模型", "准确率", "精确率", "召回率", "F1"], rows, "逻辑回归正则化对比")
# Run the logistic-regression mode directly to see how different C values affect the metrics.
mode_logistic()
Step 5:用 mode_boundary 把分类器的判断区域画成 ASCII 地图
痛点与机制:
二维决策边界就像城市地图的分区线:模型把平面分成“更像类 0”和“更像类 1”的区域。· 和 + 是模型预测区域,O 和 X 是真实样本点;如果样本大多落在对应区域,说明分类器学得还不错。
核心源码(逐字来自文末完整源码):
def mode_boundary() -> None:
    """Draw a 2-feature logistic-regression decision boundary as ASCII art.

    Trains on 200 standardized synthetic samples, classifies every cell of a
    60x20 grid spanning [-3, 3] x [-3, 3], then overlays the training samples
    (``O`` = class 0, ``X`` = class 1). Samples outside the grid are skipped.
    """
    print(f"[{nexdo_time()}] ASCII 决策边界")
    X, y = make_classification(n_samples=200, n_features=2, n_informative=2,
                               n_redundant=0, random_state=42)
    scaler = StandardScaler()
    X_s = scaler.fit_transform(X)
    clf = LogisticRegression(max_iter=500, random_state=42)
    clf.fit(X_s, y)

    # Sample the model on a grid over [-3, 3] x [-3, 3]; row 0 is the top (y_max).
    width, height = 60, 20
    x_min, x_max = -3.0, 3.0
    y_min, y_max = -3.0, 3.0
    xs = [x_min + (x_max - x_min) * col / width for col in range(width)]
    ys = [y_max - (y_max - y_min) * row / height for row in range(height)]
    # One batched predict_proba call replaces width * height single-point calls.
    points = [[xv, yv] for yv in ys for xv in xs]
    prob_class1 = clf.predict_proba(points)[:, 1]
    grid = [
        ["·" if prob_class1[row * width + col] < 0.5 else "+" for col in range(width)]
        for row in range(height)
    ]

    # Overlay the true samples on top of the predicted regions.
    for xi, yi, label in zip(X_s[:, 0], X_s[:, 1], y):
        fx = (xi - x_min) / (x_max - x_min) * width
        fy = (y_max - yi) / (y_max - y_min) * height
        # Range-check before truncating: int() rounds toward zero, so a sample
        # just outside the left/top edge would otherwise be drawn in cell 0.
        if 0 <= fx < width and 0 <= fy < height:
            grid[int(fy)][int(fx)] = "O" if label == 0 else "X"

    print("\n ASCII 决策边界(O=类0, X=类1, ·=预测0区域, +=预测1区域)")
    print(f" {'─'*width}")
    for cells in grid:
        print(f" {''.join(cells)}")
    print(f" {'─'*width}")
可运行演示(补齐 Mock 数据与 print 反馈):
import time
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
def nexdo_time() -> str:
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    stamp_format = "%Y-%m-%d %H:%M:%S"
    return time.strftime(stamp_format)
def mode_boundary() -> None:
    """Draw a 2-feature logistic-regression decision boundary as ASCII art.

    Trains on 200 standardized synthetic samples, classifies every cell of a
    60x20 grid spanning [-3, 3] x [-3, 3], then overlays the training samples
    (``O`` = class 0, ``X`` = class 1). Samples outside the grid are skipped.
    """
    print(f"[{nexdo_time()}] ASCII 决策边界")
    X, y = make_classification(n_samples=200, n_features=2, n_informative=2,
                               n_redundant=0, random_state=42)
    scaler = StandardScaler()
    X_s = scaler.fit_transform(X)
    clf = LogisticRegression(max_iter=500, random_state=42)
    clf.fit(X_s, y)

    # Sample the model on a grid over [-3, 3] x [-3, 3]; row 0 is the top (y_max).
    width, height = 60, 20
    x_min, x_max = -3.0, 3.0
    y_min, y_max = -3.0, 3.0
    xs = [x_min + (x_max - x_min) * col / width for col in range(width)]
    ys = [y_max - (y_max - y_min) * row / height for row in range(height)]
    # One batched predict_proba call replaces width * height single-point calls.
    points = [[xv, yv] for yv in ys for xv in xs]
    prob_class1 = clf.predict_proba(points)[:, 1]
    grid = [
        ["·" if prob_class1[row * width + col] < 0.5 else "+" for col in range(width)]
        for row in range(height)
    ]

    # Overlay the true samples on top of the predicted regions.
    for xi, yi, label in zip(X_s[:, 0], X_s[:, 1], y):
        fx = (xi - x_min) / (x_max - x_min) * width
        fy = (y_max - yi) / (y_max - y_min) * height
        # Range-check before truncating: int() rounds toward zero, so a sample
        # just outside the left/top edge would otherwise be drawn in cell 0.
        if 0 <= fx < width and 0 <= fy < height:
            grid[int(fy)][int(fx)] = "O" if label == 0 else "X"

    print("\n ASCII 决策边界(O=类0, X=类1, ·=预测0区域, +=预测1区域)")
    print(f" {'─'*width}")
    for cells in grid:
        print(f" {''.join(cells)}")
    print(f" {'─'*width}")
# The decision boundary works like zoning on a map: each side is one of the two classes the model predicts.
mode_boundary()
Step 6:用 main 做四种运行模式的命令行遥控器
痛点与机制:
main 是脚本遥控器。用户不用改源码,只要在终端切换 --mode linear/logistic/boundary/compare,就能分别观察线性回归、逻辑回归、决策边界和完整对比。对新手来说,这比交互式输入更稳定,也更接近真实工程脚本。
核心源码(逐字来自文末完整源码):
def main() -> None:
    """Parse ``--mode`` from the command line and run the matching demo.

    Accepted modes are ``linear``, ``logistic``, ``boundary`` and ``compare``;
    ``compare`` is used when the flag is omitted.
    """
    handlers = {
        "linear": mode_linear,
        "logistic": mode_logistic,
        "boundary": mode_boundary,
        "compare": mode_compare,
    }
    parser = argparse.ArgumentParser(description="线性与逻辑回归演示")
    # The dict keys double as the allowed choices, in the same order.
    parser.add_argument("--mode", choices=list(handlers), default="compare")
    args = parser.parse_args()
    run_selected = handlers[args.mode]
    run_selected()
可运行演示(补齐 Mock 数据与 print 反馈):
import argparse
import sys
def mode_linear() -> None:
    """Stub for the CLI demo: print a one-line description of the mode."""
    message = "linear:预测连续数值,比如房价、销量、温度"
    print(message)
def mode_logistic() -> None:
    """Stub for the CLI demo: print a one-line description of the mode."""
    message = "logistic:预测二分类,比如通过/不通过、是/否"
    print(message)
def mode_boundary() -> None:
    """Stub for the CLI demo: print a one-line description of the mode."""
    message = "boundary:把模型的分类区域画成 ASCII 地图"
    print(message)
def mode_compare() -> None:
    """Run the linear, logistic and boundary demos back to back, in that order."""
    for run_demo in (mode_linear, mode_logistic, mode_boundary):
        run_demo()
def main() -> None:
    """Parse ``--mode`` from the command line and run the matching demo.

    Accepted modes are ``linear``, ``logistic``, ``boundary`` and ``compare``;
    ``compare`` is used when the flag is omitted.
    """
    handlers = {
        "linear": mode_linear,
        "logistic": mode_logistic,
        "boundary": mode_boundary,
        "compare": mode_compare,
    }
    parser = argparse.ArgumentParser(description="线性与逻辑回归演示")
    # The dict keys double as the allowed choices, in the same order.
    parser.add_argument("--mode", choices=list(handlers), default="compare")
    args = parser.parse_args()
    run_selected = handlers[args.mode]
    run_selected()
# Simulate a user passing different arguments in the terminal; in real use only --mode changes.
for mode in ["linear", "logistic", "boundary", "compare"]:
    print(f"\n$ python3 40-regression.py --mode {mode}")
    # Overwrite argv so the argparse call inside main() sees the simulated command line.
    sys.argv = ["prog", "--mode", mode]
    main()
极客实战:完整源码与运行
现在,把上面的积木拼起来,将以下完整代码放进你的编辑器,运行它。先跑 --mode compare 看全流程,再单独切换 linear、logistic、boundary 深挖每一块。
#!/usr/bin/env python3
"""
40-regression.py
numpy手动实现梯度下降线性回归 + sklearn逻辑回归 + ASCII决策边界
用法:
python 40-regression.py --mode linear
python 40-regression.py --mode logistic
python 40-regression.py --mode boundary
python 40-regression.py --mode compare
"""
import argparse
import time
import math
from typing import Tuple
import numpy as np
from sklearn.datasets import make_classification, make_regression
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
f1_score, mean_absolute_error, mean_squared_error, r2_score)
def nexdo_time() -> str:
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    stamp_format = "%Y-%m-%d %H:%M:%S"
    return time.strftime(stamp_format)
def print_table(headers: list, rows: list, title: str = "") -> None:
    """Print ``rows`` under ``headers`` as a box-drawing table on stdout.

    Column widths are measured in terminal cells rather than code points, so
    East Asian wide/fullwidth characters (two cells each) keep the borders
    aligned.

    Args:
        headers: Column titles; each one is rendered with ``str()``.
        rows: One sequence of cell values per row. Rows shorter than
            ``headers`` are padded with empty cells; extra cells are ignored.
        title: Optional banner printed above the table between ``=`` rules.
    """
    import unicodedata  # local import keeps this snippet self-contained

    def cell_width(text: str) -> int:
        # "W" (wide) and "F" (fullwidth) characters occupy two terminal cells.
        return sum(2 if unicodedata.east_asian_width(ch) in ("W", "F") else 1
                   for ch in text)

    def pad(text: str, width: int) -> str:
        return text + " " * (width - cell_width(text))

    if title:
        print(f"\n{'='*60}\n {title}\n{'='*60}")

    head_cells = [str(h) for h in headers]
    n_cols = len(head_cells)
    body = []
    for row in rows:
        cells = [str(v) for v in row][:n_cols]
        body.append(cells + [""] * (n_cols - len(cells)))
    col_widths = [
        max([cell_width(head)] + [cell_width(cells[i]) for cells in body])
        for i, head in enumerate(head_cells)
    ]

    def rule(left: str, joint: str, right: str) -> str:
        return left + joint.join("─" * (w + 2) for w in col_widths) + right

    def render(cells: list) -> str:
        return "│" + "│".join(f" {pad(c, w)} " for c, w in zip(cells, col_widths)) + "│"

    print(rule("┌", "┬", "┐"))
    print(render(head_cells))
    print(rule("├", "┼", "┤"))
    for cells in body:
        print(render(cells))
    print(rule("└", "┴", "┘"))
# ── numpy 手动梯度下降线性回归 ──────────────────────────────────────────────
class LinearRegressionGD:
    """Linear regression trained with full-batch gradient descent on MSE.

    The model is ``y_hat = X @ w + b``. Every epoch uses all samples to move
    ``w`` and ``b`` one step of size ``lr`` against the gradient of the mean
    squared error.
    """

    def __init__(self, lr: float = 0.01, epochs: int = 1000):
        self.lr = lr                       # gradient step size
        self.epochs = epochs               # number of full-batch updates
        self.w: np.ndarray = np.array([])  # feature weights, filled in by fit()
        self.b: float = 0.0                # intercept, filled in by fit()
        self.losses: list = []             # MSE measured before each update

    def fit(self, X: np.ndarray, y: np.ndarray) -> "LinearRegressionGD":
        """Learn ``w`` and ``b`` from ``X`` (n_samples, n_features) and ``y``.

        Each call starts again from zero weights and an empty loss history, so
        refitting the same instance does not mix results from earlier runs.
        ``y`` may be given as ``(n_samples,)`` or ``(n_samples, 1)``.

        Returns:
            ``self``, so calls can be chained.
        """
        X = np.asarray(X, dtype=float)
        # Flatten the target so a column vector cannot broadcast against y_hat.
        y = np.asarray(y, dtype=float).ravel()
        n, d = X.shape
        self.w = np.zeros(d)
        self.b = 0.0
        self.losses = []  # drop the history of any previous fit()
        step = self.lr * (2 / n)  # lr times the 2/n factor of the MSE gradient
        for _ in range(self.epochs):
            y_hat = X @ self.w + self.b
            err = y_hat - y
            self.w -= step * (X.T @ err)
            self.b -= step * err.sum()
            # Loss of the parameters used for this epoch's prediction.
            self.losses.append(float(np.mean(err ** 2)))
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return ``X @ w + b`` for the rows of ``X``."""
        return X @ self.w + self.b
def mode_linear() -> None:
    """Train ``LinearRegressionGD`` on synthetic data and print its metrics.

    Builds a 500-sample, 5-feature regression problem, holds out 20% for
    testing, standardizes the features, trains for 500 epochs, then prints an
    ASCII loss chart and a table of MAE / MSE / RMSE / R².
    """
    print(f"[{nexdo_time()}] 梯度下降线性回归")
    features, targets = make_regression(n_samples=500, n_features=5, noise=10, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.2, random_state=42
    )

    # Scaling statistics come from the training split and are reused on the test split.
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)

    model = LinearRegressionGD(lr=0.05, epochs=500)
    model.fit(X_train_s, y_train)
    y_pred = model.predict(X_test_s)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # ASCII loss chart: one column per sampled epoch (0, 100, 200, ...),
    # eight rows whose thresholds are fractions of the largest sampled loss.
    print("\n 训练损失曲线(每100轮)")
    sampled = model.losses[::100]
    peak = max(sampled)
    height = 8
    for level in range(height, 0, -1):
        threshold = level * peak / height
        bars = "".join("█" if loss >= threshold else " " for loss in sampled)
        print(f" {threshold:8.1f} │{bars}")
    print(f" └{'─'*len(sampled)}")
    print(" 轮次(×100)")

    metric_rows = [
        ["MAE", f"{mae:.4f}"],
        ["MSE", f"{mse:.4f}"],
        ["RMSE", f"{math.sqrt(mse):.4f}"],
        ["R²", f"{r2:.4f}"],
    ]
    print_table(["指标", "值"], metric_rows, "线性回归评估指标")
def mode_logistic() -> None:
    """Compare logistic-regression classifiers with different ``C`` values.

    Builds a 1000-sample, 10-feature binary problem, holds out 20% for
    testing, standardizes the features, fits each candidate and prints
    accuracy / precision / recall / F1 on the test split as a table.
    """
    print(f"[{nexdo_time()}] sklearn 逻辑回归")
    X, y = make_classification(n_samples=1000, n_features=10, n_informative=6, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)

    # (row label, C) pairs; every model shares max_iter=500 and random_state=42.
    # NOTE(review): "Ridge(LR)" is built with exactly the same arguments as
    # "LR(C=1.0)", so both rows report identical metrics.
    candidates = [
        ("LR(C=0.01)", 0.01),
        ("LR(C=1.0)", 1.0),
        ("LR(C=100)", 100),
        ("Ridge(LR)", 1.0),
    ]
    scorers = (accuracy_score, precision_score, recall_score, f1_score)

    rows = []
    for label, c_value in candidates:
        clf = LogisticRegression(C=c_value, max_iter=500, random_state=42)
        clf.fit(X_train_s, y_train)
        predicted = clf.predict(X_test_s)
        rows.append([label] + [f"{score(y_test, predicted):.4f}" for score in scorers])
    print_table(["模型", "准确率", "精确率", "召回率", "F1"], rows, "逻辑回归正则化对比")
def mode_boundary() -> None:
    """Draw a 2-feature logistic-regression decision boundary as ASCII art.

    Trains on 200 standardized synthetic samples, classifies every cell of a
    60x20 grid spanning [-3, 3] x [-3, 3], then overlays the training samples
    (``O`` = class 0, ``X`` = class 1). Samples outside the grid are skipped.
    """
    print(f"[{nexdo_time()}] ASCII 决策边界")
    X, y = make_classification(n_samples=200, n_features=2, n_informative=2,
                               n_redundant=0, random_state=42)
    scaler = StandardScaler()
    X_s = scaler.fit_transform(X)
    clf = LogisticRegression(max_iter=500, random_state=42)
    clf.fit(X_s, y)

    # Sample the model on a grid over [-3, 3] x [-3, 3]; row 0 is the top (y_max).
    width, height = 60, 20
    x_min, x_max = -3.0, 3.0
    y_min, y_max = -3.0, 3.0
    xs = [x_min + (x_max - x_min) * col / width for col in range(width)]
    ys = [y_max - (y_max - y_min) * row / height for row in range(height)]
    # One batched predict_proba call replaces width * height single-point calls.
    points = [[xv, yv] for yv in ys for xv in xs]
    prob_class1 = clf.predict_proba(points)[:, 1]
    grid = [
        ["·" if prob_class1[row * width + col] < 0.5 else "+" for col in range(width)]
        for row in range(height)
    ]

    # Overlay the true samples on top of the predicted regions.
    for xi, yi, label in zip(X_s[:, 0], X_s[:, 1], y):
        fx = (xi - x_min) / (x_max - x_min) * width
        fy = (y_max - yi) / (y_max - y_min) * height
        # Range-check before truncating: int() rounds toward zero, so a sample
        # just outside the left/top edge would otherwise be drawn in cell 0.
        if 0 <= fx < width and 0 <= fy < height:
            grid[int(fy)][int(fx)] = "O" if label == 0 else "X"

    print("\n ASCII 决策边界(O=类0, X=类1, ·=预测0区域, +=预测1区域)")
    print(f" {'─'*width}")
    for cells in grid:
        print(f" {''.join(cells)}")
    print(f" {'─'*width}")
def mode_compare() -> None:
    """Run the linear, logistic and boundary demos back to back, in that order."""
    for run_demo in (mode_linear, mode_logistic, mode_boundary):
        run_demo()
def main() -> None:
    """Parse ``--mode`` from the command line and run the matching demo.

    Accepted modes are ``linear``, ``logistic``, ``boundary`` and ``compare``;
    ``compare`` is used when the flag is omitted.
    """
    handlers = {
        "linear": mode_linear,
        "logistic": mode_logistic,
        "boundary": mode_boundary,
        "compare": mode_compare,
    }
    parser = argparse.ArgumentParser(description="线性与逻辑回归演示")
    # The dict keys double as the allowed choices, in the same order.
    parser.add_argument("--mode", choices=list(handlers), default="compare")
    args = parser.parse_args()
    run_selected = handlers[args.mode]
    run_selected()
# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
$ python3 40-regression.py --mode linear
[2026-04-18 10:44:21] 梯度下降线性回归
训练损失曲线(每100轮)
13530.9 │█
11839.5 │█
10148.2 │█
8456.8 │█
6765.4 │█
5074.1 │█
3382.7 │█
1691.4 │█
└─────
轮次(×100)
============================================================
线性回归评估指标
============================================================
┌──────┬──────────┐
│ 指标 │ 值 │
├──────┼──────────┤
│ MAE │ 8.4038 │
│ MSE │ 106.3250 │
…(输出节选:RMSE、R² 两行与表格底边此处省略)
$ python3 40-regression.py --mode logistic
[2026-04-18 10:44:22] sklearn 逻辑回归
============================================================
逻辑回归正则化对比
============================================================
┌────────────┬────────┬────────┬────────┬────────┐
│ 模型 │ 准确率 │ 精确率 │ 召回率 │ F1 │
├────────────┼────────┼────────┼────────┼────────┤
│ LR(C=0.01) │ 0.8050 │ 0.7105 │ 0.9310 │ 0.8060 │
│ LR(C=1.0) │ 0.8000 │ 0.7156 │ 0.8966 │ 0.7959 │
│ LR(C=100) │ 0.8000 │ 0.7156 │ 0.8966 │ 0.7959 │
│ Ridge(LR) │ 0.8000 │ 0.7156 │ 0.8966 │ 0.7959 │
└────────────┴────────┴────────┴────────┴────────┘
小结
| 模块 | 你要记住什么 |
|---|---|
| LinearRegressionGD | 用梯度下降一点点修正权重和偏置,让预测误差变小 |
| mode_linear | 完成连续值预测闭环:造数据、标准化、训练、预测、评估 |
| mode_logistic | 完成二分类评估:准确率、精确率、召回率、F1 不能只看一个 |
| mode_boundary | 把分类器的判断区域画出来,帮助你理解“模型边界” |
| main | 用 argparse 做脚本遥控器,真实工程里比交互式输入更稳 |
⏱ NexDo Time(5 分钟)
挑战:把 mode_logistic() 里的 C 值列表改成 [0.001, 0.01, 0.1, 1, 10],重新运行并观察 F1 分数变化。
Don’t wait for next time, do it in the next moment.