26 · HTML 解析:正则与 HTMLParser 实战
🔗 知识图谱导航:阅读本文前,建议先掌握《25 · 网络爬虫:HTTP 抓取与数据提取》中的爬虫框架——本文是它的下一层:爬虫拿到 HTML 字符串后,如何从里面提取结构化数据。
运行环境:Python 3.12+ 标准库,零额外依赖,直接运行。
极客解析:HTML 解析有两种思路:正则是"暴力搜索",适合简单固定的模式;HTMLParser 是"事件驱动",适合复杂嵌套结构。理解了这两种,再看 BeautifulSoup/lxml 就只是封装层的差异。
正则 vs HTMLParser 对比
正则表达式:
优点:简单直接,适合固定模式(如提取所有价格)
缺点:HTML 嵌套结构复杂时正则难以维护,容易漏匹配或误匹配
适用:简单属性提取、固定格式的文本
HTMLParser(事件驱动):
优点:正确处理嵌套结构,不会被 HTML 实体或属性顺序干扰
缺点:代码量更多,需要维护状态机
适用:复杂嵌套结构、需要精确定位的数据
BeautifulSoup/lxml(第三方):
优点:CSS 选择器/XPath,代码最简洁
缺点:需要安装依赖
适用:生产爬虫项目
步步为营:核心逻辑自适应拆解
这一篇的核心是 HTML 解析的两种方式:正则(暴力搜索)和 HTMLParser(事件驱动状态机)。下面每一步都聚焦一个机制,用同一份 Mock HTML 对比两种方式的效果。
Step 1:用正则表达式从 HTML 里提取商品数据
痛点与机制:
parse_with_regex 先用 re.findall 配合捕获组切出每个商品块,再用 re.search 从块里逐个提取字段。re.DOTALL 让 . 匹配换行符,这在 HTML 里很重要——标签内容经常跨行。正则的优势是简洁,劣势是脆弱:HTML 属性顺序变了、多了一个空格,正则就可能失效。所以正则适合"我知道这个网站的 HTML 结构不会变"的场景。
核心源码(逐字来自文末完整源码):
def parse_with_regex(html: str) -> list[dict]:
    """Extract product records from ``html`` with regular expressions only.

    Each product block is located first, then the individual fields are
    searched inside that block. A block is kept only when name, link and
    price are all present; a missing stock label falls back to a placeholder.

    Args:
        html: Markup containing ``<div class="product-item" ...>`` blocks
            whose last child is a nested ``<div>``.

    Returns:
        One dict per product with keys ``id``, ``name``, ``url``, ``price``
        (float) and ``stock``.
    """
    block_re = re.compile(
        r'<div class="product-item" data-id="(\d+)">(.*?)</div>\s*</div>',
        re.DOTALL,  # product blocks span several lines
    )
    records: list[dict] = []
    for found in block_re.finditer(html):
        product_id, body = found.groups()
        name = re.search(r'class="product-name">(.*?)</h2>', body)
        link = re.search(r'href="(/product/\d+)"', body)
        price = re.search(r'class="price">¥([\d.]+)', body)
        stock = re.search(r'class="stock[^"]*">([^<]+)', body)
        if not (name and link and price):
            continue
        record = {
            "id": product_id,
            "name": name.group(1),
            "url": link.group(1),
            "price": float(price.group(1)),
        }
        record["stock"] = stock.group(1) if stock else "未知"
        records.append(record)
    return records
可运行演示(补齐 Mock 数据与 print 反馈):
import re
html = """
<div class="product-item" data-id="1001">
<h2 class="product-name">机械键盘 Pro X</h2>
<a href="/product/1001" class="detail-link">查看详情</a>
<span class="price">¥599.00</span>
<span class="stock in-stock">有货</span>
</div>
"""


def parse_with_regex(html: str) -> list[dict]:
    """Extract product records from flat (non-nested) product blocks.

    Args:
        html: Markup whose product blocks contain no inner ``<div>``.

    Returns:
        One dict per product with keys ``id``, ``name``, ``url``, ``price``
        (float) and ``stock``; blocks missing name, link or price are skipped.
    """
    block_re = re.compile(
        r'<div class="product-item" data-id="(\d+)">(.*?)</div>',
        re.DOTALL,
    )
    records: list[dict] = []
    for found in block_re.finditer(html):
        product_id, body = found.groups()
        # Each search picks exactly one field out of the product block.
        name = re.search(r'class="product-name">(.*?)</h2>', body)
        link = re.search(r'href="(/product/\d+)"', body)
        price = re.search(r'class="price">¥([\d.]+)', body)
        stock = re.search(r'class="stock[^"]*">([^<]+)', body)
        if not (name and link and price):
            continue
        record = {
            "id": product_id,
            "name": name.group(1),
            "url": link.group(1),
            "price": float(price.group(1)),
        }
        record["stock"] = stock.group(1) if stock else "未知"
        records.append(record)
    return records


products = parse_with_regex(html)
print("提取条数:", len(products))
print("第一条商品:", products[0])
Step 2:用 HTMLParser 实现事件驱动的结构化解析
痛点与机制:
ProductParser 继承 html.parser.HTMLParser,用状态机追踪当前在哪个标签里。handle_starttag 在遇到 <div class="product-item"> 时开始收集,handle_data 收集文本内容,handle_endtag 在遇到 </div> 时保存一条记录。_current/_capture/_in_tags_div 等状态变量记录当前解析位置,这是事件驱动解析的核心模式。
核心源码(逐字来自文末完整源码):
class ProductParser(HTMLParser):
    """Event-driven state machine that collects product records from HTML.

    A record starts at ``<div class="product-item">`` and is stored in
    ``products`` when that same div closes, provided a name was captured.
    Open ``<div>`` elements inside the item are counted so that the end of a
    nested block (for example the tags container) does not end the record
    early and drop the fields that follow it.
    """

    def __init__(self) -> None:
        super().__init__()
        self.products: list[dict] = []
        self._current: dict | None = None  # record being filled, if any
        self._capture: str | None = None  # field the next text chunk belongs to
        self._in_tags_div = False  # True while inside the tags container
        self._div_depth = 0  # open <div> count, including the item itself

    def handle_starttag(self, tag: str, attrs: list[tuple]) -> None:
        """Open a record or arm capture of the upcoming text."""
        attr = dict(attrs)
        # A valueless ``class`` attribute is reported as None, not "".
        cls = attr.get("class") or ""
        if tag == "div" and "product-item" in cls:
            # Start a fresh record; an unfinished previous one is discarded.
            self._current = {"id": attr.get("data-id", ""), "tags": []}
            self._capture = None
            self._in_tags_div = False
            self._div_depth = 1
            return
        if self._current is None:
            return
        if tag == "div":
            self._div_depth += 1
            if "tags" in cls:
                self._in_tags_div = True
        elif tag == "h2" and "product-name" in cls:
            self._capture = "name"
        elif tag == "a" and "detail-link" in cls:
            self._current["url"] = attr.get("href", "")
        elif tag == "span" and "price" in cls:
            self._capture = "price"
        elif tag == "span" and "stock" in cls:
            self._capture = "stock"
        elif tag == "span" and self._in_tags_div:
            self._capture = "_tag"

    def handle_data(self, data: str) -> None:
        """Store a non-blank text chunk in the field armed by the last tag."""
        data = data.strip()
        if not data or self._current is None or self._capture is None:
            return
        if self._capture == "_tag":
            self._current["tags"].append(data)
        elif self._capture == "price":
            self._current["price"] = float(data.lstrip("¥"))
        else:
            self._current[self._capture] = data
        self._capture = None

    def handle_endtag(self, tag: str) -> None:
        """Close nested blocks and store the record when its own div ends."""
        if self._current is None:
            return
        if tag in ("h2", "span"):
            # An empty element must not pass its capture target to later text.
            self._capture = None
            return
        if tag != "div":
            return
        self._div_depth -= 1
        if self._div_depth > 0:
            # Only a nested div closed; the record is still open.
            self._in_tags_div = False
            return
        if "name" in self._current:
            self.products.append(self._current)
        self._current = None
        self._capture = None
        self._in_tags_div = False
可运行演示(补齐 Mock 数据与 print 反馈):
from html.parser import HTMLParser
html = """
<div class="product-item" data-id="1001">
<h2 class="product-name">机械键盘 Pro X</h2>
<a href="/product/1001" class="detail-link">查看详情</a>
<span class="price">¥599.00</span>
<span class="stock in-stock">有货</span>
<div class="tags"><span>办公</span><span>游戏</span></div>
</div>
"""


class ProductParser(HTMLParser):
    """Event-driven state machine that collects product records from HTML.

    A record starts at ``<div class="product-item">`` and is stored in
    ``products`` when that same div closes, provided a name was captured.
    Open ``<div>`` elements inside the item are counted so that the end of a
    nested block (for example the tags container) does not end the record
    early and drop the fields that follow it.
    """

    def __init__(self) -> None:
        super().__init__()
        self.products: list[dict] = []
        self._current: dict | None = None  # record being filled, if any
        self._capture: str | None = None  # field the next text chunk belongs to
        self._in_tags_div = False  # True while inside the tags container
        self._div_depth = 0  # open <div> count, including the item itself

    def handle_starttag(self, tag: str, attrs: list[tuple]) -> None:
        """Open a record or arm capture of the upcoming text."""
        attr = dict(attrs)
        # A valueless ``class`` attribute is reported as None, not "".
        cls = attr.get("class") or ""
        if tag == "div" and "product-item" in cls:
            # Start a fresh record; an unfinished previous one is discarded.
            self._current = {"id": attr.get("data-id", ""), "tags": []}
            self._capture = None
            self._in_tags_div = False
            self._div_depth = 1
            return
        if self._current is None:
            return
        if tag == "div":
            self._div_depth += 1
            if "tags" in cls:
                self._in_tags_div = True
        elif tag == "h2" and "product-name" in cls:
            self._capture = "name"
        elif tag == "a" and "detail-link" in cls:
            self._current["url"] = attr.get("href", "")
        elif tag == "span" and "price" in cls:
            self._capture = "price"
        elif tag == "span" and "stock" in cls:
            self._capture = "stock"
        elif tag == "span" and self._in_tags_div:
            self._capture = "_tag"

    def handle_data(self, data: str) -> None:
        """Store a non-blank text chunk in the field armed by the last tag."""
        data = data.strip()
        if not data or self._current is None or self._capture is None:
            return
        if self._capture == "_tag":
            self._current["tags"].append(data)
        elif self._capture == "price":
            self._current["price"] = float(data.lstrip("¥"))
        else:
            self._current[self._capture] = data
        self._capture = None

    def handle_endtag(self, tag: str) -> None:
        """Close nested blocks and store the record when its own div ends."""
        if self._current is None:
            return
        if tag in ("h2", "span"):
            # An empty element must not pass its capture target to later text.
            self._capture = None
            return
        if tag != "div":
            return
        self._div_depth -= 1
        if self._div_depth > 0:
            # Only a nested div closed; the record is still open.
            self._in_tags_div = False
            return
        if "name" in self._current:
            self.products.append(self._current)
        self._current = None
        self._capture = None
        self._in_tags_div = False


parser = ProductParser()
parser.feed(html)
parser.close()  # flush any text still buffered by the parser
print("解析结果:", parser.products[0])
print("标签列表:", parser.products[0]["tags"])
Step 3:对比正则 vs HTMLParser 的解析效果
痛点与机制:
mode_compare 用同一份 HTML 分别跑正则和 HTMLParser,用 ascii_table 格式化对比结果。两种方式应该得到相同的数据——如果不同,说明其中一种解析有 bug。这个对比演示让读者直观看到:正则代码更短,HTMLParser 代码更长但更健壮。
核心源码(逐字来自文末完整源码):
def mode_compare(_: argparse.Namespace) -> None:
    """Time both parsers on ``MOCK_HTML`` and print a comparison table.

    Args:
        _: Parsed CLI arguments; not used by this mode.
    """
    print(f"=== 正则 vs html.parser 对比 [{now_str()}] ===\n")
    import time

    # (label, parser function, remark) for each approach being compared.
    contenders = (
        ("正则 re", parse_with_regex, "脆弱,HTML变动即失效"),
        ("html.parser", parse_with_html_parser, "稳健,容错性好"),
    )
    table_rows = []
    for label, parse, remark in contenders:
        started = time.perf_counter()
        parsed = parse(MOCK_HTML)
        elapsed_ms = (time.perf_counter() - started) * 1000
        table_rows.append([label, len(parsed), f"{elapsed_ms:.3f}ms", "内置", remark])

    print(ascii_table(
        ["方式", "提取条数", "耗时", "依赖", "特点"],
        table_rows, title="解析方式对比"
    ))
可运行演示(补齐 Mock 数据与 print 反馈):
import re
import time
from html.parser import HTMLParser
html = """
<div class="product-item" data-id="1001"><h2 class="product-name">键盘</h2><span class="price">¥599.00</span></div>
<div class="product-item" data-id="1002"><h2 class="product-name">显示器</h2><span class="price">¥2199.00</span></div>
"""


def regex_count(html: str) -> int:
    """Count product names by pattern-matching the raw markup."""
    return sum(1 for _ in re.finditer(r'class="product-name">(.*?)</h2>', html))


class CountParser(HTMLParser):
    """Collect the text of every ``<h2>`` whose class is exactly ``product-name``."""

    def __init__(self):
        super().__init__()
        self.capture = False  # True between a matching <h2> and its text
        self.names: list[str] = []

    def handle_starttag(self, tag, attrs):
        if tag != "h2":
            return
        if dict(attrs).get("class") == "product-name":
            self.capture = True

    def handle_data(self, data):
        text = data.strip()
        if self.capture and text:
            self.names.append(text)
            self.capture = False


# Time the regex approach.
t0 = time.perf_counter()
r_count = regex_count(html)
t_re = (time.perf_counter() - t0) * 1000

# Time the event-driven approach; the parser is built before the clock starts.
parser = CountParser()
t0 = time.perf_counter()
parser.feed(html)
t_hp = (time.perf_counter() - t0) * 1000

print("正则提取条数:", r_count, f"耗时 {t_re:.3f}ms")
print("HTMLParser 提取条数:", len(parser.names), f"耗时 {t_hp:.3f}ms")
print("结论: 简单固定结构用正则快;嵌套复杂结构用 HTMLParser 稳。")
Step 4:用 mode_extract 演示结构化数据提取
痛点与机制:
mode_extract 演示从 HTML 里提取多种类型的数据:商品 ID、名称、价格、库存、标签。它调用 parse_with_html_parser 拿到结构化的商品列表,再用 ascii_table 打印成表格:价格统一格式化为两位小数,标签用逗号拼接。顺带一提,如果改用正则来提取:re.findall 返回所有匹配,re.search 返回第一个匹配——根据需求选择合适的函数。
核心源码(逐字来自文末完整源码):
def mode_extract(_: argparse.Namespace) -> None:
    """Parse ``MOCK_HTML`` with the HTMLParser approach and print a product table.

    Args:
        _: Parsed CLI arguments; not used by this mode.
    """
    print(f"=== 结构化数据提取 [{now_str()}] ===\n")
    table_rows = []
    for product in parse_with_html_parser(MOCK_HTML):
        price_text = f"¥{product['price']:.2f}"
        tag_text = ",".join(product.get("tags", []))
        table_rows.append(
            [product["id"], product["name"], price_text, product.get("stock", ""), tag_text]
        )
    print(ascii_table(
        ["ID", "商品名称", "价格", "库存", "标签"],
        table_rows, title="html.parser 提取结果"
    ))
可运行演示(补齐 Mock 数据与 print 反馈):
from html.parser import HTMLParser
html = """
<div class="product-item" data-id="1001">
<h2 class="product-name">机械键盘 Pro X</h2>
<span class="price">¥599.00</span>
<span class="stock in-stock">有货</span>
</div>
"""


class ExtractParser(HTMLParser):
    """Collect id, name, price and stock of a product into ``item``."""

    # (tag, class marker, target field) checked in order; first match wins.
    _FIELD_RULES = (
        ("h2", "product-name", "name"),
        ("span", "price", "price"),
        ("span", "stock", "stock"),
    )

    def __init__(self):
        super().__init__()
        self.item = {}
        self.capture = None  # field that the next non-blank text belongs to

    def handle_starttag(self, tag, attrs):
        attributes = dict(attrs)
        css = attributes.get("class", "")
        if tag == "div" and "product-item" in css:
            self.item["id"] = attributes.get("data-id")
            return
        for wanted_tag, marker, field in self._FIELD_RULES:
            if tag == wanted_tag and marker in css:
                self.capture = field
                return

    def handle_data(self, data):
        text = data.strip()
        if not text or not self.capture:
            return
        if self.capture == "price":
            # Prices are stored as numbers, without the currency sign.
            self.item[self.capture] = float(text.lstrip("¥"))
        else:
            self.item[self.capture] = text
        self.capture = None


parser = ExtractParser()
parser.feed(html)
print("结构化字段:")
for key, value in parser.item.items():
    print(f" {key}: {value}")
Step 5:用 mode_nested 演示嵌套结构解析
痛点与机制:
mode_nested 演示从页面的嵌套结构里取数据:<head> 里的 <title>、正文里的 <h1>,以及 <footer> 里的链接。做法是"先圈范围、再提字段"——先用非贪婪的 <footer>.*?</footer> 把页脚整块取出来,再在这一块里提取链接。嵌套结构是正则最难处理的场景:嵌套层次不固定时,正则很容易写出"贪婪匹配"的 bug。这种情况下 HTMLParser 的状态机更可靠,因为它按 HTML 的树形结构逐层处理。
核心源码(逐字来自文末完整源码):
def mode_nested(_: argparse.Namespace) -> None:
    """Print the page title, the H1 heading and the footer links of ``MOCK_HTML``.

    Args:
        _: Parsed CLI arguments; not used by this mode.
    """
    print(f"=== 嵌套结构解析 [{now_str()}] ===\n")

    def first_group(pattern: str) -> str:
        """Return group 1 of the first match in MOCK_HTML, or a placeholder."""
        found = re.search(pattern, MOCK_HTML)
        return found.group(1) if found else "N/A"

    page_title = first_group(r"<title>(.*?)</title>")
    heading = first_group(r'id="main-title">(.*?)</h1>')

    # Narrow the search to the footer first, then pull the links out of it.
    footer = re.search(r'<footer>.*?</footer>', MOCK_HTML, re.DOTALL)
    anchors = re.findall(r'href="([^"]+)">([^<]+)</a>', footer.group(0)) if footer else []

    print(f"页面标题: {page_title}")
    print(f"H1 标题: {heading}")
    print()
    if anchors:
        link_rows = [list(pair) for pair in anchors]
        print(ascii_table(["链接路径", "文字"], link_rows, title="页脚链接"))
可运行演示(补齐 Mock 数据与 print 反馈):
import re
html = """
<html>
<head><title>数码好物清单 - 2026春季</title></head>
<body>
<h1 id="main-title">2026 春季数码好物</h1>
<footer>
<a href="/about">关于我们</a> |
<a href="/contact">联系方式</a>
</footer>
</body>
</html>
"""


def mode_nested() -> None:
    """Print the page title, the H1 heading and the footer links of ``html``."""

    def first_group(pattern: str) -> str:
        """Return group 1 of the first match in ``html``, or a placeholder."""
        found = re.search(pattern, html)
        return found.group(1) if found else "N/A"

    # Narrow the search to the footer first, then pull the links out of it.
    footer = re.search(r"<footer>(.*?)</footer>", html, re.DOTALL)
    anchors = []
    if footer:
        anchors = re.findall(r'href="([^"]+)">([^<]+)</a>', footer.group(1))

    print("页面标题:", first_group(r"<title>(.*?)</title>"))
    print("H1 标题:", first_group(r'id="main-title">(.*?)</h1>'))
    print("页脚链接:")
    for target, label in anchors:
        print(f" {label} -> {target}")


mode_nested()
Step 6:用正则提取 HTML 里的所有链接
痛点与机制:
从 HTML 里提取链接是最常见的爬虫任务之一。正则 href=["']([^"']+)["'] 匹配所有 href 属性值,不管是单引号还是双引号。关键是用字符类排除法 [^"']+——“不包含引号的一个或多个字符”,避免了贪婪匹配跨越多个属性的问题。
核心源码(逐字来自文末完整源码):
MOCK_HTML = """
<!DOCTYPE html>
<html lang="zh-CN">
<head><title>数码好物清单 - 2026春季</title></head>
<body>
<h1 id="main-title">2026 春季数码好物</h1>
<p class="subtitle">精选 <strong>5</strong> 款高性价比产品</p>
<div class="product-list">
<div class="product-item" data-id="1001">
<h2 class="product-name">机械键盘 Pro X</h2>
<a href="/product/1001" class="detail-link">查看详情</a>
<span class="price">¥599.00</span>
<span class="stock in-stock">有货</span>
<div class="tags"><span>办公</span><span>游戏</span></div>
</div>
<div class="product-item" data-id="1002">
<h2 class="product-name">4K 显示器 27寸</h2>
<a href="/product/1002" class="detail-link">查看详情</a>
<span class="price">¥2199.00</span>
<span class="stock out-of-stock">缺货</span>
<div class="tags"><span>设计</span><span>办公</span></div>
</div>
<div class="product-item" data-id="1003">
<h2 class="product-name">人体工学椅 E3</h2>
<a href="/product/1003" class="detail-link">查看详情</a>
<span class="price">¥1899.00</span>
<span class="stock in-stock">有货</span>
<div class="tags"><span>健康</span><span>办公</span></div>
</div>
<div class="product-item" data-id="1004">
<h2 class="product-name">无线降噪耳机 Q45</h2>
<a href="/product/1004" class="detail-link">查看详情</a>
<span class="price">¥899.00</span>
<span class="stock in-stock">有货</span>
<div class="tags"><span>音乐</span><span>通勤</span></div>
</div>
<div class="product-item" data-id="1005">
<h2 class="product-name">便携 SSD 1TB</h2>
<a href="/product/1005" class="detail-link">查看详情</a>
<span class="price">¥459.00</span>
<span class="stock in-stock">有货</span>
<div class="tags"><span>存储</span><span>便携</span></div>
</div>
</div>
<footer>
<a href="/about">关于我们</a> |
<a href="/contact">联系方式</a>
</footer>
</body>
</html>
"""
可运行演示(补齐 Mock 数据与 print 反馈):
import re
html = """
<a href="/product/1001">机械键盘</a>
<a href="https://example.com/help">帮助中心</a>
<a href="/contact">联系方式</a>
"""

# href value in single or double quotes, any further attributes, then the text.
# The exclusion classes ([^"'] / [^>] / [^<]) keep each part inside its own
# attribute or tag instead of running on greedily.
LINK_RE = re.compile(r'''href=["']([^"']+)["'][^>]*>([^<]+)</a>''')


def extract_links(html: str) -> list[tuple[str, str]]:
    """Return ``(href, text)`` for every anchor found in ``html``.

    Args:
        html: Markup to scan.

    Returns:
        Pairs in document order; an empty list when nothing matches.
    """
    return LINK_RE.findall(html)


def classify_link(href: str) -> str:
    """Label ``href`` as external (absolute or protocol-relative) or internal."""
    # Checking full scheme prefixes avoids mislabelling relative paths that
    # merely begin with the letters "http".
    if href.startswith(("http://", "https://", "//")):
        return "外链"
    return "站内链接"


links = extract_links(html)
print("提取到", len(links), "个链接:")
for href, text in links:
    kind = classify_link(href)
    print(f" [{kind}] {text} -> {href}")
Step 7:用 HTMLParser 的事件流理解解析过程
痛点与机制:
EventPrinter 是一个调试用的 HTMLParser 子类,把所有事件都打印出来。通过观察事件流,可以理解 HTMLParser 的工作原理:遇到开始标签触发 handle_starttag,遇到文本触发 handle_data,遇到结束标签触发 handle_endtag。这个调试技巧在分析复杂 HTML 结构时非常有用。
核心源码(逐字来自文末完整源码):
class ProductParser(HTMLParser):
    """Event-driven state machine that collects product records from HTML.

    A record starts at ``<div class="product-item">`` and is stored in
    ``products`` when that same div closes, provided a name was captured.
    Open ``<div>`` elements inside the item are counted so that the end of a
    nested block (for example the tags container) does not end the record
    early and drop the fields that follow it.
    """

    def __init__(self) -> None:
        super().__init__()
        self.products: list[dict] = []
        self._current: dict | None = None  # record being filled, if any
        self._capture: str | None = None  # field the next text chunk belongs to
        self._in_tags_div = False  # True while inside the tags container
        self._div_depth = 0  # open <div> count, including the item itself

    def handle_starttag(self, tag: str, attrs: list[tuple]) -> None:
        """Open a record or arm capture of the upcoming text."""
        attr = dict(attrs)
        # A valueless ``class`` attribute is reported as None, not "".
        cls = attr.get("class") or ""
        if tag == "div" and "product-item" in cls:
            # Start a fresh record; an unfinished previous one is discarded.
            self._current = {"id": attr.get("data-id", ""), "tags": []}
            self._capture = None
            self._in_tags_div = False
            self._div_depth = 1
            return
        if self._current is None:
            return
        if tag == "div":
            self._div_depth += 1
            if "tags" in cls:
                self._in_tags_div = True
        elif tag == "h2" and "product-name" in cls:
            self._capture = "name"
        elif tag == "a" and "detail-link" in cls:
            self._current["url"] = attr.get("href", "")
        elif tag == "span" and "price" in cls:
            self._capture = "price"
        elif tag == "span" and "stock" in cls:
            self._capture = "stock"
        elif tag == "span" and self._in_tags_div:
            self._capture = "_tag"

    def handle_data(self, data: str) -> None:
        """Store a non-blank text chunk in the field armed by the last tag."""
        data = data.strip()
        if not data or self._current is None or self._capture is None:
            return
        if self._capture == "_tag":
            self._current["tags"].append(data)
        elif self._capture == "price":
            self._current["price"] = float(data.lstrip("¥"))
        else:
            self._current[self._capture] = data
        self._capture = None

    def handle_endtag(self, tag: str) -> None:
        """Close nested blocks and store the record when its own div ends."""
        if self._current is None:
            return
        if tag in ("h2", "span"):
            # An empty element must not pass its capture target to later text.
            self._capture = None
            return
        if tag != "div":
            return
        self._div_depth -= 1
        if self._div_depth > 0:
            # Only a nested div closed; the record is still open.
            self._in_tags_div = False
            return
        if "name" in self._current:
            self.products.append(self._current)
        self._current = None
        self._capture = None
        self._in_tags_div = False
可运行演示(补齐 Mock 数据与 print 反馈):
from html.parser import HTMLParser
html = "<div><h2>机械键盘</h2><span>¥599.00</span></div>"


class EventPrinter(HTMLParser):
    """Debugging parser that prints every start tag, text chunk and end tag."""

    def _report(self, label, *details):
        """Print one event line: the label followed by its details."""
        print(label, *details)

    def handle_starttag(self, tag, attrs):
        self._report("开始标签:", tag, attrs)

    def handle_data(self, data):
        stripped = data.strip()
        if not stripped:
            # Whitespace-only chunks are skipped to keep the output readable.
            return
        self._report("文本内容:", stripped)

    def handle_endtag(self, tag):
        self._report("结束标签:", tag)


print("HTMLParser 不是一次性抽字符串,而是按事件流通知你:")
EventPrinter().feed(html)
Step 8:用 main 做 compare/extract/nested 三种模式的 CLI 总入口
痛点与机制:
main 用 argparse 做 CLI 入口,三种模式对应三个学习层次:compare 对比两种解析方式,extract 看结构化数据提取,nested 看嵌套结构解析。用字典分发替代 if/elif 链,是 Python 里处理多分支的惯用写法。
核心源码(逐字来自文末完整源码):
def main() -> None:
    """Parse ``--mode`` from the command line and run the matching demo."""
    # Dispatch table: mode name -> handler taking the parsed arguments.
    handlers = {"compare": mode_compare, "extract": mode_extract, "nested": mode_nested}
    cli = argparse.ArgumentParser(description="HTML 解析引擎演示")
    cli.add_argument("--mode", choices=list(handlers), default="extract")
    options = cli.parse_args()
    run = handlers[options.mode]
    run(options)
可运行演示(补齐 Mock 数据与 print 反馈):
import argparse
import sys
def mode_compare(args):
print("compare 模式: 对比正则和 HTMLParser 的解析结果")
def mode_extract(args):
print("extract 模式: 提取商品 id、名称、价格、库存、标签")
def mode_nested(args):
print("nested 模式: 提取标题、H1、页脚链接等嵌套信息")
def main() -> None:
parser = argparse.ArgumentParser(description="HTML 解析方式演示")
parser.add_argument("--mode", choices=["compare", "extract", "nested"], default="compare")
args = parser.parse_args()
if args.mode == "extract":
mode_extract(args)
elif args.mode == "nested":
mode_nested(args)
else:
mode_compare(args)
for mode in ["compare", "extract", "nested"]:
sys.argv = ["prog", "--mode", mode]
print(f">>> python3 26-python-spider-parse.py --mode {mode}")
main()
极客实战:完整源码与运行
现在,把上面的积木拼起来,将以下完整代码放进你的编辑器,运行它。先看整体闭环,再回头逐段改参数,你会更容易建立工程直觉。
#!/usr/bin/env python3
"""
25_spider_parse.py — HTML 解析引擎演示(零外部依赖)
用法:
python3 25_spider_parse.py --mode compare # 正则 vs html.parser 对比
python3 25_spider_parse.py --mode extract # 结构化数据提取
python3 25_spider_parse.py --mode nested # 嵌套结构解析
"""
import argparse
import re
import unicodedata
from datetime import datetime
from html.parser import HTMLParser
from typing import Any
from zoneinfo import ZoneInfo
TZ = ZoneInfo("Asia/Shanghai")
def now_str() -> str:
    """Return the current time in ``TZ`` formatted as ``YYYY-MM-DD HH:MM:SS``."""
    current = datetime.now(tz=TZ)
    return f"{current:%Y-%m-%d %H:%M:%S}"
def _display_width(text: str) -> int:
    """Return how many terminal columns ``text`` occupies."""
    # East Asian wide and fullwidth characters take two columns each.
    return sum(2 if unicodedata.east_asian_width(ch) in ("W", "F") else 1 for ch in text)


def _pad(text: str, width: int, *, center: bool = False) -> str:
    """Pad ``text`` with spaces to ``width`` display columns."""
    gap = max(width - _display_width(text), 0)
    if not center:
        return text + " " * gap
    # Same rounding rule as str.center, so ASCII output stays unchanged.
    left = gap // 2 + (gap & width & 1)
    return " " * left + text + " " * (gap - left)


def ascii_table(headers: list[str], rows: list[list[Any]], title: str = "") -> str:
    """Render ``rows`` as a boxed plain-text table.

    Column widths are measured in terminal columns rather than characters,
    so tables containing CJK text stay aligned.

    Args:
        headers: Column titles; their count fixes the number of columns.
        rows: Table rows. Cells are converted with ``str``; rows shorter than
            ``headers`` are padded with empty cells.
        title: Optional caption, centred in an extra line above the header.

    Returns:
        The table as one newline-joined string without a trailing newline.

    Raises:
        IndexError: If a row has more cells than there are headers.
    """
    col_w = [_display_width(h) for h in headers]
    cells = []
    for row in rows:
        texts = [str(c) for c in row]
        for i, text in enumerate(texts):
            col_w[i] = max(col_w[i], _display_width(text))
        texts += [""] * (len(headers) - len(texts))
        cells.append(texts)

    def render(values: list[str]) -> str:
        """Format one line of left-aligned, space-padded cells."""
        return "|" + "|".join(f" {_pad(v, w)} " for v, w in zip(values, col_w)) + "|"

    sep = "+" + "+".join("-" * (w + 2) for w in col_w) + "+"
    lines = []
    if title:
        # Full line width: two padding columns per cell plus the separators.
        total = sum(col_w) + 3 * len(col_w) + 1
        lines += [sep, f"|{_pad(title, total - 2, center=True)}|"]
    lines += [sep, render(headers), sep]
    lines.extend(render(texts) for texts in cells)
    lines.append(sep)
    return "\n".join(lines)
# ── Mock HTML ────────────────────────────────────────────────
MOCK_HTML = """
<!DOCTYPE html>
<html lang="zh-CN">
<head><title>数码好物清单 - 2026春季</title></head>
<body>
<h1 id="main-title">2026 春季数码好物</h1>
<p class="subtitle">精选 <strong>5</strong> 款高性价比产品</p>
<div class="product-list">
<div class="product-item" data-id="1001">
<h2 class="product-name">机械键盘 Pro X</h2>
<a href="/product/1001" class="detail-link">查看详情</a>
<span class="price">¥599.00</span>
<span class="stock in-stock">有货</span>
<div class="tags"><span>办公</span><span>游戏</span></div>
</div>
<div class="product-item" data-id="1002">
<h2 class="product-name">4K 显示器 27寸</h2>
<a href="/product/1002" class="detail-link">查看详情</a>
<span class="price">¥2199.00</span>
<span class="stock out-of-stock">缺货</span>
<div class="tags"><span>设计</span><span>办公</span></div>
</div>
<div class="product-item" data-id="1003">
<h2 class="product-name">人体工学椅 E3</h2>
<a href="/product/1003" class="detail-link">查看详情</a>
<span class="price">¥1899.00</span>
<span class="stock in-stock">有货</span>
<div class="tags"><span>健康</span><span>办公</span></div>
</div>
<div class="product-item" data-id="1004">
<h2 class="product-name">无线降噪耳机 Q45</h2>
<a href="/product/1004" class="detail-link">查看详情</a>
<span class="price">¥899.00</span>
<span class="stock in-stock">有货</span>
<div class="tags"><span>音乐</span><span>通勤</span></div>
</div>
<div class="product-item" data-id="1005">
<h2 class="product-name">便携 SSD 1TB</h2>
<a href="/product/1005" class="detail-link">查看详情</a>
<span class="price">¥459.00</span>
<span class="stock in-stock">有货</span>
<div class="tags"><span>存储</span><span>便携</span></div>
</div>
</div>
<footer>
<a href="/about">关于我们</a> |
<a href="/contact">联系方式</a>
</footer>
</body>
</html>
"""
# ── 方式一:正则解析 ─────────────────────────────────────────
def parse_with_regex(html: str) -> list[dict]:
    """Extract product records from ``html`` with regular expressions only.

    Each product block is located first, then the individual fields are
    searched inside that block. A block is kept only when name, link and
    price are all present; a missing stock label falls back to a placeholder.

    Args:
        html: Markup containing ``<div class="product-item" ...>`` blocks
            whose last child is a nested ``<div>``.

    Returns:
        One dict per product with keys ``id``, ``name``, ``url``, ``price``
        (float) and ``stock``.
    """
    block_re = re.compile(
        r'<div class="product-item" data-id="(\d+)">(.*?)</div>\s*</div>',
        re.DOTALL,  # product blocks span several lines
    )
    records: list[dict] = []
    for found in block_re.finditer(html):
        product_id, body = found.groups()
        name = re.search(r'class="product-name">(.*?)</h2>', body)
        link = re.search(r'href="(/product/\d+)"', body)
        price = re.search(r'class="price">¥([\d.]+)', body)
        stock = re.search(r'class="stock[^"]*">([^<]+)', body)
        if not (name and link and price):
            continue
        record = {
            "id": product_id,
            "name": name.group(1),
            "url": link.group(1),
            "price": float(price.group(1)),
        }
        record["stock"] = stock.group(1) if stock else "未知"
        records.append(record)
    return records
# ── 方式二:html.parser 解析 ─────────────────────────────────
class ProductParser(HTMLParser):
    """Event-driven state machine that collects product records from HTML.

    A record starts at ``<div class="product-item">`` and is stored in
    ``products`` when that same div closes, provided a name was captured.
    Open ``<div>`` elements inside the item are counted so that the end of a
    nested block (for example the tags container) does not end the record
    early and drop the fields that follow it.
    """

    def __init__(self) -> None:
        super().__init__()
        self.products: list[dict] = []
        self._current: dict | None = None  # record being filled, if any
        self._capture: str | None = None  # field the next text chunk belongs to
        self._in_tags_div = False  # True while inside the tags container
        self._div_depth = 0  # open <div> count, including the item itself

    def handle_starttag(self, tag: str, attrs: list[tuple]) -> None:
        """Open a record or arm capture of the upcoming text."""
        attr = dict(attrs)
        # A valueless ``class`` attribute is reported as None, not "".
        cls = attr.get("class") or ""
        if tag == "div" and "product-item" in cls:
            # Start a fresh record; an unfinished previous one is discarded.
            self._current = {"id": attr.get("data-id", ""), "tags": []}
            self._capture = None
            self._in_tags_div = False
            self._div_depth = 1
            return
        if self._current is None:
            return
        if tag == "div":
            self._div_depth += 1
            if "tags" in cls:
                self._in_tags_div = True
        elif tag == "h2" and "product-name" in cls:
            self._capture = "name"
        elif tag == "a" and "detail-link" in cls:
            self._current["url"] = attr.get("href", "")
        elif tag == "span" and "price" in cls:
            self._capture = "price"
        elif tag == "span" and "stock" in cls:
            self._capture = "stock"
        elif tag == "span" and self._in_tags_div:
            self._capture = "_tag"

    def handle_data(self, data: str) -> None:
        """Store a non-blank text chunk in the field armed by the last tag."""
        data = data.strip()
        if not data or self._current is None or self._capture is None:
            return
        if self._capture == "_tag":
            self._current["tags"].append(data)
        elif self._capture == "price":
            self._current["price"] = float(data.lstrip("¥"))
        else:
            self._current[self._capture] = data
        self._capture = None

    def handle_endtag(self, tag: str) -> None:
        """Close nested blocks and store the record when its own div ends."""
        if self._current is None:
            return
        if tag in ("h2", "span"):
            # An empty element must not pass its capture target to later text.
            self._capture = None
            return
        if tag != "div":
            return
        self._div_depth -= 1
        if self._div_depth > 0:
            # Only a nested div closed; the record is still open.
            self._in_tags_div = False
            return
        if "name" in self._current:
            self.products.append(self._current)
        self._current = None
        self._capture = None
        self._in_tags_div = False
def parse_with_html_parser(html: str) -> list[dict]:
    """Parse ``html`` with ``ProductParser`` and return the collected records.

    Args:
        html: Complete markup of the page to parse.

    Returns:
        The parser's ``products`` list, one dict per stored product.
    """
    parser = ProductParser()
    parser.feed(html)
    # feed() may hold back trailing text until more input arrives; close()
    # tells the parser the document is complete so nothing stays buffered.
    parser.close()
    return parser.products
# ── CLI 模式 ─────────────────────────────────────────────────
def mode_compare(_: argparse.Namespace) -> None:
    """Time both parsers on ``MOCK_HTML`` and print a comparison table.

    Args:
        _: Parsed CLI arguments; not used by this mode.
    """
    print(f"=== 正则 vs html.parser 对比 [{now_str()}] ===\n")
    import time

    # (label, parser function, remark) for each approach being compared.
    contenders = (
        ("正则 re", parse_with_regex, "脆弱,HTML变动即失效"),
        ("html.parser", parse_with_html_parser, "稳健,容错性好"),
    )
    table_rows = []
    for label, parse, remark in contenders:
        started = time.perf_counter()
        parsed = parse(MOCK_HTML)
        elapsed_ms = (time.perf_counter() - started) * 1000
        table_rows.append([label, len(parsed), f"{elapsed_ms:.3f}ms", "内置", remark])

    print(ascii_table(
        ["方式", "提取条数", "耗时", "依赖", "特点"],
        table_rows, title="解析方式对比"
    ))
def mode_extract(_: argparse.Namespace) -> None:
    """Parse ``MOCK_HTML`` with the HTMLParser approach and print a product table.

    Args:
        _: Parsed CLI arguments; not used by this mode.
    """
    print(f"=== 结构化数据提取 [{now_str()}] ===\n")
    table_rows = []
    for product in parse_with_html_parser(MOCK_HTML):
        price_text = f"¥{product['price']:.2f}"
        tag_text = ",".join(product.get("tags", []))
        table_rows.append(
            [product["id"], product["name"], price_text, product.get("stock", ""), tag_text]
        )
    print(ascii_table(
        ["ID", "商品名称", "价格", "库存", "标签"],
        table_rows, title="html.parser 提取结果"
    ))
def mode_nested(_: argparse.Namespace) -> None:
    """Print the page title, the H1 heading and the footer links of ``MOCK_HTML``.

    Args:
        _: Parsed CLI arguments; not used by this mode.
    """
    print(f"=== 嵌套结构解析 [{now_str()}] ===\n")

    def first_group(pattern: str) -> str:
        """Return group 1 of the first match in MOCK_HTML, or a placeholder."""
        found = re.search(pattern, MOCK_HTML)
        return found.group(1) if found else "N/A"

    page_title = first_group(r"<title>(.*?)</title>")
    heading = first_group(r'id="main-title">(.*?)</h1>')

    # Narrow the search to the footer first, then pull the links out of it.
    footer = re.search(r'<footer>.*?</footer>', MOCK_HTML, re.DOTALL)
    anchors = re.findall(r'href="([^"]+)">([^<]+)</a>', footer.group(0)) if footer else []

    print(f"页面标题: {page_title}")
    print(f"H1 标题: {heading}")
    print()
    if anchors:
        link_rows = [list(pair) for pair in anchors]
        print(ascii_table(["链接路径", "文字"], link_rows, title="页脚链接"))
def main() -> None:
    """Parse ``--mode`` from the command line and run the matching demo."""
    # Dispatch table: mode name -> handler taking the parsed arguments.
    handlers = {"compare": mode_compare, "extract": mode_extract, "nested": mode_nested}
    cli = argparse.ArgumentParser(description="HTML 解析引擎演示")
    cli.add_argument("--mode", choices=list(handlers), default="extract")
    options = cli.parse_args()
    run = handlers[options.mode]
    run(options)
if __name__ == "__main__":
main()
(以下输出仅为格式示意;实际运行时,表格的列和数据来自上面的 MOCK_HTML,耗时等数值以本机输出为准。)
$ python3 26-python-spider-parse.py --mode compare
=== 正则 vs html.parser 对比 [2026-04-18 00:02:04] ===
+---+------------------+-------+------+--------+
| # | 标题 | 价格 | 评分 | 解析器 |
+---+------------------+-------+------+--------+
| 1 | MacBook Pro M3 | 14999 | 4.8 | 正则 |
| 2 | iPhone 15 Pro | 8999 | 4.9 | 正则 |
...
两种方式结果一致: ✅
$ python3 26-python-spider-parse.py --mode extract
=== 结构化数据提取 [2026-04-18 00:02:04] ===
商品数量: 5
价格范围: ¥299 ~ ¥14999
平均评分: 4.76
小结
| 概念 | 一句话记忆 |
|---|---|
| re.findall | 返回所有匹配,适合提取多个值 |
| re.DOTALL | 让 . 匹配换行符,HTML 解析必备 |
| 命名捕获组 | (?P<name>...) 让结果可以用名字访问 |
| HTMLParser | 事件驱动,handle_starttag/data/endtag 三个钩子 |
| 状态机 | 用布尔标志追踪当前在哪个标签里,是 HTMLParser 的核心模式 |
| 正则适用场景 | 固定格式、简单属性提取 |
| HTMLParser 适用场景 | 复杂嵌套、需要精确定位 |
⏱ NexDo Time(5 分钟)
挑战:用 HTMLParser 提取 MOCK_HTML 里所有链接的文本和 URL。
具体步骤:
- 继承 HTMLParser,在 handle_starttag 里检测 tag == "a" 并保存 href 属性
- 在 handle_data 里保存链接文本(用状态标志判断当前是否在 <a> 标签里)
- 在 handle_endtag 里检测 tag == "a" 并重置状态
- 解析 MOCK_HTML,打印所有链接的 (文本, URL) 对
Don’t wait for next time, do it in the next moment.