9 · 文件系统与异常:构建健壮的 IO 流
🔗 知识图谱导航:阅读本文前,建议先掌握/回顾 《8 · 模块化构建:标准库与自定义包》 中的核心概念;本文会在这个基础上继续推进。 上一篇我们用面向对象封装了数据模型;本篇把这些对象落地到磁盘——学会读写文件、处理异常,才能构建真正健壮的 IO 流。
极客解析:先把数据流、控制流和模块边界跑通,再谈抽象;每段代码都围绕一个可执行 CLI 闭环展开。
痛点与架构:单独记 API 或概念很容易学完就忘;本文先锁定真实痛点,再把它拆成“输入数据 → 核心机制 → 可运行输出”三段闭环。掌握 pathlib/open/with 文件读写、try/except/finally 异常处理、JSON/CSV 序列化,并用 tempfile 构建零副作用的知识库文件扫描器。
1. pathlib:现代文件路径操作
from pathlib import Path
# NOTE: unlike the tempfile-based examples later on, this leaves /tmp/kb on disk.
kb_dir = Path("/tmp/kb")
kb_dir.mkdir(parents=True, exist_ok=True)  # create missing parents; no error if it exists
readme = kb_dir / "readme.txt"  # "/" joins path components
print(readme.suffix)  # .txt
print(readme.stem)  # readme
print(readme.parent)  # /tmp/kb
pathlib.Path 把路径当对象,告别字符串拼接的脆弱。
2. open / with:安全读写文件
import tempfile
from pathlib import Path
with tempfile.TemporaryDirectory() as tmpdir:
    note_path = Path(tmpdir) / "note.txt"

    # Write: the handle is closed as soon as the inner with-block ends.
    with open(note_path, "w", encoding="utf-8") as out:
        out.write("Hello, Knowledge Base!\n")

    # Read every line back; iterating a text file yields one line at a time.
    with open(note_path, encoding="utf-8") as src:
        lines = list(src)

    print("文件路径:", note_path)
    print("读取行数:", len(lines))
    print("第一行:", lines[0].strip())
with 语句保证文件句柄在任何情况下都会被关闭,即使中途抛出异常。
3. 异常处理:try / except / finally / 自定义异常
import tempfile
from pathlib import Path
class KBError(Exception):
    """Base class for every error raised by the knowledge-base tooling."""


class FileParseError(KBError):
    """Raised when a knowledge-base file cannot be read or decoded.

    Attributes:
        path: Location of the offending file, as passed by the caller.
        reason: Short description of what went wrong.
    """

    def __init__(self, path: str, reason: str) -> None:
        super().__init__(f"解析失败 [{path}]: {reason}")
        self.path = path
        # Keep the reason too, so handlers need not parse the message.
        self.reason = reason

    def __reduce__(self) -> tuple[type, tuple[str, str]]:
        # By default exceptions are copied/pickled as type(self)(*self.args),
        # but args holds only the formatted message.  Rebuild from the two
        # original arguments so copy.copy() and pickle round-trip instead of
        # failing with a TypeError.
        return (type(self), (self.path, self.reason))
def read_safe(path: Path) -> str:
    """Return the UTF-8 text stored at *path*.

    Decoding and permission failures are re-raised as ``FileParseError``
    chained to the original exception; any other error (for example
    ``FileNotFoundError``) propagates unchanged.  A completion line is
    printed on every exit path.
    """
    try:
        content = path.read_text(encoding="utf-8")
    except UnicodeDecodeError as exc:
        raise FileParseError(str(path), f"编码错误: {exc}") from exc
    except PermissionError as exc:
        raise FileParseError(str(path), f"无权限: {exc}") from exc
    else:
        return content
    finally:
        # Cleanup (if any) belongs here: this runs on success and on failure.
        print(f"读取流程结束: {path.name}")
# Demo
with tempfile.TemporaryDirectory() as tmpdir:
    workdir = Path(tmpdir)

    ok_file = workdir / "ok.txt"
    ok_file.write_text("文件内容正常", encoding="utf-8")
    print("正常读取:", read_safe(ok_file))

    # read_safe() does not translate FileNotFoundError, so the built-in
    # exception reaches the caller unchanged.
    missing_file = workdir / "missing.txt"
    try:
        print("读取缺失文件:", read_safe(missing_file))
    except FileNotFoundError as exc:
        print("捕获到底层异常:", type(exc).__name__)

print("自定义异常示例:", FileParseError("/tmp/test.txt", "格式错误"))
异常链(raise ... from e)保留原始堆栈,调试时一目了然。
4. JSON 读写
import json
import tempfile
from pathlib import Path
def save_json(data: dict, path: Path) -> None:
    """Serialise *data* to *path* as UTF-8 JSON, indented by two spaces.

    ``ensure_ascii=False`` writes non-ASCII text (e.g. Chinese) literally
    rather than as escape sequences.
    """
    with open(path, "w", encoding="utf-8") as out:
        out.write(json.dumps(data, ensure_ascii=False, indent=2))
def load_json(path: Path) -> dict:
    """Parse the UTF-8 JSON document at *path* and return the result."""
    with open(path, encoding="utf-8") as src:
        raw = src.read()
    return json.loads(raw)
with tempfile.TemporaryDirectory() as tmpdir:
    report_path = Path(tmpdir) / "report.json"
    # Round trip: write the report, then read it straight back.
    save_json({"top_words": {"知识库": 3, "AI": 2}}, report_path)
    loaded = load_json(report_path)
    print("写入文件:", report_path.name)
    print("读回数据:", loaded)
5. CSV 读写
import csv
import tempfile
from pathlib import Path
def save_csv(rows: list[dict], path: Path, fieldnames: list[str]) -> None:
    """Write *rows* to *path* as UTF-8 CSV, preceded by a header row.

    *fieldnames* fixes both the header and the column order.
    """
    # newline="" hands line-ending control to the csv module.
    with open(path, "w", newline="", encoding="utf-8") as out:
        writer = csv.DictWriter(out, fieldnames=fieldnames)
        writer.writeheader()
        for row in rows:
            writer.writerow(row)
def load_csv(path: Path) -> list[dict]:
    """Read the UTF-8 CSV at *path* into a list of dicts keyed by the header row.

    Every value comes back as ``str``: ``csv.DictReader`` is used with its
    default settings, which perform no type conversion.
    """
    # The csv docs ask for newline="" when reading as well: without it,
    # universal-newline translation rewrites "\r\n" inside quoted fields.
    with open(path, newline="", encoding="utf-8") as f:
        return list(csv.DictReader(f))
with tempfile.TemporaryDirectory() as tmpdir:
    path = Path(tmpdir) / "report.csv"
    rows = [{"word": "知识库", "count": 3}, {"word": "AI", "count": 2}]
    save_csv(rows, path, ["word", "count"])
    # Parse the file once and reuse the result for both prints.
    loaded = load_csv(path)
    print("写入文件:", path.name)
    print("读回行数:", len(loaded))
    # "count" is read back as the string "3", not the int 3 that was written.
    print("第一行:", loaded[0])
6. tempfile:零副作用测试数据
import tempfile
from pathlib import Path

# Everything created inside the with-block lives in a fresh temporary directory.
with tempfile.TemporaryDirectory() as tmpdir:
    p = Path(tmpdir) / "test.txt"
    p.write_text("临时数据", encoding="utf-8")
    print(p.read_text(encoding="utf-8"))
# Leaving the with-block deletes the directory and everything in it.
实战:知识库文件扫描器
下面是完整可运行代码,用 tempfile 创建测试数据,零副作用。
步步为营:核心逻辑自适应拆解
这一篇我们做一个“知识库文件扫描器”:先安全读取文件,再分词统计,最后输出 JSON/CSV 报告。你可以把流程想成整理书架:先准备书架,再逐本文档读取,抽关键词,统计频次,最后生成清单。
Step 1:先定义知识库专用异常,别让底层报错裸奔
痛点与机制:
异常不是为了吓人,而是为了把错误说清楚。KBError 是知识库错误的总类,FileParseError 是文件解析失败的具体类型。它像医院分诊牌:先知道是“知识库系统的问题”,再看具体是哪个文件、什么原因。
核心源码(逐字来自文末完整源码):
class KBError(Exception):
    """Base class for every error raised by the knowledge-base tooling."""


class FileParseError(KBError):
    """Raised when a knowledge-base file cannot be read or decoded.

    Attributes:
        path: Location of the offending file, as passed by the caller.
        reason: Short description of what went wrong.
    """

    def __init__(self, path: str, reason: str) -> None:
        super().__init__(f"解析失败 [{path}]: {reason}")
        self.path = path
        # Keep the reason too, so handlers need not parse the message.
        self.reason = reason

    def __reduce__(self) -> tuple[type, tuple[str, str]]:
        # By default exceptions are copied/pickled as type(self)(*self.args),
        # but args holds only the formatted message.  Rebuild from the two
        # original arguments so copy.copy() and pickle round-trip instead of
        # failing with a TypeError.
        return (type(self), (self.path, self.reason))
可运行演示(补齐 Mock 数据与 print 反馈):
# Step 1:自定义异常像“业务专用警报牌”,比裸露的系统错误更好懂。
class KBError(Exception):
"""知识库基础异常"""
class FileParseError(KBError):
def __init__(self, path: str, reason: str) -> None:
super().__init__(f"解析失败 [{path}]: {reason}")
self.path = path
try:
raise FileParseError("/tmp/bad.txt", "编码错误")
except KBError as exc:
print("捕获到知识库异常:", exc)
print("出错路径:", exc.path)
Step 2:用正则 tokenize 把文本拆成可统计的词
痛点与机制:
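tokenize() 用正则从文本里捞词。[a-zA-Z]{2,} 抓英文词,[\u4e00-\u9fff]{2,} 抓连续的中文字符(\u4e00-\u9fff 就是从“一”到“鿿”的 Unicode 区间)。注意它并不做真正的中文分词:夹在标点、空格或英文之间的一整段汉字会被当成一个词,例如“包括数据清洗”整体算一个词。你可以把它理解成一个筛子:单个符号和太短的碎片会漏掉,留下能统计的片段。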
核心源码(逐字来自文末完整源码):
def tokenize(text: str) -> list[str]:
    """Split *text* into countable tokens.

    A token is a run of two or more ASCII letters, or a run of two or more
    characters in the range U+4E00..U+9FFF.  Shorter runs, digits and
    punctuation are dropped.
    """
    token_pattern = re.compile(r"[a-zA-Z]{2,}|[\u4e00-\u9fff]{2,}")
    return [match.group() for match in token_pattern.finditer(text)]
可运行演示(补齐 Mock 数据与 print 反馈):
import re
# Step 2:分词函数像筛子,只留下长度足够的中英文词。
def tokenize(text: str) -> list[str]:
"""简单分词:提取长度≥2的中英文词"""
return re.findall(r"[a-zA-Z]{2,}|[\u4e00-\u9fff]{2,}", text)
text = "AI pipeline 包括数据清洗、embedding生成、向量检索。Python 3 很适合做 IO。"
words = tokenize(text)
print("原文:", text)
print("分词结果:", words)
Step 3:用 read_safe 安全读取文件,并把底层异常翻译成人话
痛点与机制:
直接 path.read_text() 出错时,新手会看到很长的系统异常。read_safe() 把编码错误、权限错误翻译成 FileParseError,让上层知道“这是文件解析失败”,并保留原始异常链,方便排查。
核心源码(逐字来自文末完整源码):
def read_safe(path: Path) -> str:
    """Return the UTF-8 text stored at *path*.

    Decoding and permission failures are re-raised as ``FileParseError``
    chained to the original exception; any other error (for example
    ``FileNotFoundError``) propagates unchanged.
    """
    try:
        content = path.read_text(encoding="utf-8")
    except UnicodeDecodeError as exc:
        raise FileParseError(str(path), f"编码错误: {exc}") from exc
    except PermissionError as exc:
        raise FileParseError(str(path), f"无权限: {exc}") from exc
    return content
可运行演示(补齐 Mock 数据与 print 反馈):
from pathlib import Path
import tempfile
class KBError(Exception):
"""知识库基础异常"""
class FileParseError(KBError):
def __init__(self, path: str, reason: str) -> None:
super().__init__(f"解析失败 [{path}]: {reason}")
self.path = path
# Step 3:安全读取像戴手套拿玻璃杯,出错时给出清楚的业务错误。
def read_safe(path: Path) -> str:
try:
return path.read_text(encoding="utf-8")
except UnicodeDecodeError as e:
raise FileParseError(str(path), f"编码错误: {e}") from e
except PermissionError as e:
raise FileParseError(str(path), f"无权限: {e}") from e
with tempfile.TemporaryDirectory() as tmpdir:
path = Path(tmpdir) / "note.txt"
path.write_text("知识库文件读取成功", encoding="utf-8")
print("读取结果:", read_safe(path))
Step 4:用 scan_kb 扫描目录,把多个文件汇总成词频表
痛点与机制:
scan_kb() 做三件事:找出 .txt/.md 文件,安全读取每个文件,调用 tokenize() 后用 Counter 计数。遇到编码错误或无权限的坏文件时,read_safe() 抛出的 FileParseError 会被捕获并打印 [WARN],扫描继续处理下一个文件,而不是整个中断,这才是健壮 IO。
核心源码(逐字来自文末完整源码):
def scan_kb(kb_dir: Path) -> dict[str, int]:
    """Count tokens across every ``.txt``/``.md`` file below *kb_dir*.

    Args:
        kb_dir: Root directory; searched recursively.

    Returns:
        The 20 most frequent tokens mapped to their counts, most frequent
        first.

    A file that cannot be read or decoded is reported with a ``[WARN]`` line
    and skipped; it does not abort the scan.
    """
    counter: Counter[str] = Counter()
    # rglob() also yields directories whose names match (e.g. "notes.md/"),
    # so keep regular files only.  Sorting makes the scan order, and with it
    # the tie-breaking in most_common(), reproducible.
    files = sorted(
        p
        for pattern in ("*.txt", "*.md")
        for p in kb_dir.rglob(pattern)
        if p.is_file()
    )
    print(f"发现文件: {len(files)} 个")
    for f in files:
        try:
            counter.update(tokenize(read_safe(f)))
        except FileParseError as e:
            print(f" [WARN] {e}")
        except OSError as e:
            # read_safe() translates only decode/permission errors; a file
            # that vanished or hit an I/O error must not stop the scan.
            print(f" [WARN] 读取失败 [{f}]: {e}")
    return dict(counter.most_common(20))
可运行演示(补齐 Mock 数据与 print 反馈):
import re
import tempfile
from collections import Counter
from pathlib import Path
class KBError(Exception):
"""知识库基础异常"""
class FileParseError(KBError):
def __init__(self, path: str, reason: str) -> None:
super().__init__(f"解析失败 [{path}]: {reason}")
self.path = path
def tokenize(text: str) -> list[str]:
"""简单分词:提取长度≥2的中英文词"""
return re.findall(r"[a-zA-Z]{2,}|[\u4e00-\u9fff]{2,}", text)
def read_safe(path: Path) -> str:
try:
return path.read_text(encoding="utf-8")
except UnicodeDecodeError as e:
raise FileParseError(str(path), f"编码错误: {e}") from e
except PermissionError as e:
raise FileParseError(str(path), f"无权限: {e}") from e
# Step 4:扫描器像图书管理员,逐本读文件,再把关键词计数汇总。
def scan_kb(kb_dir: Path) -> dict[str, int]:
"""扫描目录下所有 .txt/.md 文件,返回全局词频"""
counter: Counter[str] = Counter()
files = list(kb_dir.rglob("*.txt")) + list(kb_dir.rglob("*.md"))
print(f"发现文件: {len(files)} 个")
for f in files:
try:
text = read_safe(f)
words = tokenize(text)
counter.update(words)
except FileParseError as e:
print(f" [WARN] {e}")
return dict(counter.most_common(20))
with tempfile.TemporaryDirectory() as tmpdir:
kb = Path(tmpdir) / "kb"
kb.mkdir()
(kb / "a.md").write_text("知识库 AI 知识库", encoding="utf-8")
(kb / "b.txt").write_text("AI pipeline 数据清洗", encoding="utf-8")
print("词频:", scan_kb(kb))
Step 5:用 print_table 把词频结果排成终端表格
痛点与机制:
数据只存在字典里时,对人不友好。print_table() 把词和频次对齐输出,像把账单列成两栏:左边是什么词,右边出现几次。终端工具的体验很大程度取决于这种输出是否清楚。
核心源码(逐字来自文末完整源码):
def print_table(freq: dict[str, int]) -> None:
    """Print *freq* as a two-column table: token on the left, count on the right.

    Columns are padded by terminal display width rather than ``len()``.
    East Asian wide/full-width characters occupy two cells, so ``len()``-based
    alignment pushes rows with Chinese tokens out of line with ASCII ones.
    """
    # Local import: nothing else here needs unicodedata.
    from unicodedata import east_asian_width

    def pad(text: str, width: int, *, right: bool = False) -> str:
        """Pad *text* with spaces up to *width* terminal cells."""
        cells = sum(2 if east_asian_width(ch) in ("W", "F") else 1 for ch in text)
        fill = " " * max(width - cells, 0)
        return fill + text if right else text + fill

    print(f"\n{pad('词语', 16)} {pad('频次', 6, right=True)}")
    print("─" * 24)
    for word, cnt in freq.items():
        print(f"{pad(word, 16)} {cnt:>6}")
可运行演示(补齐 Mock 数据与 print 反馈):
# Step 5:表格输出像把散乱账单排整齐,新手一眼能读懂结果。
def print_table(freq: dict[str, int]) -> None:
print(f"\n{'词语':<16} {'频次':>6}")
print("─" * 24)
for word, cnt in freq.items():
print(f"{word:<16} {cnt:>6}")
freq = {"知识库": 5, "AI": 3, "pipeline": 2}
print_table(freq)
Step 6:用 write_report 同时生成 JSON 和 CSV 两种报告
痛点与机制:
write_report() 同时写 JSON 和 CSV。JSON 更适合程序继续读取,CSV 更适合用 Excel 或 Numbers 打开。这里用 with open(...) 保证文件写完后自动关闭,避免数据还没落盘就退出。注意:Excel 直接打开不带 BOM 的 UTF-8 CSV 时,中文可能显示为乱码;如果报告主要给 Excel 用,可以把 CSV 的 encoding 改成 "utf-8-sig"。
核心源码(逐字来自文末完整源码):
def write_report(freq: dict[str, int], out_dir: Path) -> None:
    """Write *freq* to ``report.json`` and ``report.csv`` inside *out_dir*.

    Args:
        freq: Token -> count mapping; its iteration order is the row order.
        out_dir: Destination directory, created (with parents) if missing.

    The JSON file holds ``{"top_words": freq}``; the CSV has a ``word,count``
    header followed by one row per token.  Both paths are printed.
    """
    # Callers should not have to pre-create the directory; exist_ok keeps
    # the call harmless when they already did.
    out_dir.mkdir(parents=True, exist_ok=True)

    # JSON
    json_path = out_dir / "report.json"
    with open(json_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps Chinese tokens readable in the file.
        json.dump({"top_words": freq}, f, ensure_ascii=False, indent=2)
    print(f"JSON 报告: {json_path}")

    # CSV
    csv_path = out_dir / "report.csv"
    rows = [{"word": w, "count": c} for w, c in freq.items()]
    # newline="" hands line-ending control to the csv module.
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["word", "count"])
        writer.writeheader()
        writer.writerows(rows)
    print(f"CSV 报告: {csv_path}")
可运行演示(补齐 Mock 数据与 print 反馈):
import csv
import json
import tempfile
from pathlib import Path
# Step 6:JSON 给程序读,CSV 给表格软件读,同一份结果输出两种格式。
def write_report(freq: dict[str, int], out_dir: Path) -> None:
# JSON
json_path = out_dir / "report.json"
with open(json_path, "w", encoding="utf-8") as f:
json.dump({"top_words": freq}, f, ensure_ascii=False, indent=2)
print(f"JSON 报告: {json_path}")
# CSV
csv_path = out_dir / "report.csv"
rows = [{"word": w, "count": c} for w, c in freq.items()]
with open(csv_path, "w", newline="", encoding="utf-8") as f:
writer = csv.DictWriter(f, fieldnames=["word", "count"])
writer.writeheader()
writer.writerows(rows)
print(f"CSV 报告: {csv_path}")
with tempfile.TemporaryDirectory() as tmpdir:
out = Path(tmpdir)
freq = {"知识库": 5, "AI": 3, "pipeline": 2}
write_report(freq, out)
print("JSON内容:", (out / "report.json").read_text(encoding="utf-8").splitlines()[0])
print("CSV首行:", (out / "report.csv").read_text(encoding="utf-8").splitlines()[0])
Step 7:用 SAMPLE_DOCS 和 build_test_kb 自动造测试知识库
痛点与机制:
教程不能要求新手“自己先准备一堆文件”。SAMPLE_DOCS 内置几篇小文档,build_test_kb() 用 tempfile 现场生成知识库目录。这样复制代码就能跑,跑完临时目录自动清理。
核心源码(逐字来自文末完整源码):
# (file name, file content) pairs used to populate the sample knowledge base.
SAMPLE_DOCS: list[tuple[str, str]] = [
    ("intro.md", "# 知识库简介\n知识库是AI系统的核心组件,用于存储和检索知识。AI系统依赖知识库提供准确答案。"),
    ("retrieval.txt", "向量检索是知识库的关键技术。embedding向量化后存入向量数据库,检索时计算余弦相似度。"),
    ("pipeline.md", "AI pipeline包括数据清洗、embedding生成、向量存储、语义检索四个阶段。数据清洗是pipeline的第一步。"),
    ("faq.txt", "常见问题:知识库如何更新?知识库支持增量更新,无需重建全量索引。知识库的检索延迟如何优化?"),
]


def build_test_kb(tmpdir: Path) -> Path:
    """Create the sample knowledge base under *tmpdir* and return its root.

    Layout: ``<tmpdir>/knowledge_base/docs/<name>`` for every
    ``(name, content)`` pair in ``SAMPLE_DOCS``, each written as UTF-8.
    """
    kb_root = tmpdir / "knowledge_base"
    docs_dir = kb_root / "docs"
    kb_root.mkdir()
    docs_dir.mkdir()
    for filename, body in SAMPLE_DOCS:
        docs_dir.joinpath(filename).write_text(body, encoding="utf-8")
    return kb_root
可运行演示(补齐 Mock 数据与 print 反馈):
from pathlib import Path
import tempfile
# Step 7:Mock 文档像训练用小样本,不需要你手动准备真实文件夹。
SAMPLE_DOCS: list[tuple[str, str]] = [
("intro.md", "# 知识库简介\n知识库是AI系统的核心组件,用于存储和检索知识。AI系统依赖知识库提供准确答案。"),
("retrieval.txt", "向量检索是知识库的关键技术。embedding向量化后存入向量数据库,检索时计算余弦相似度。"),
("pipeline.md", "AI pipeline包括数据清洗、embedding生成、向量存储、语义检索四个阶段。数据清洗是pipeline的第一步。"),
("faq.txt", "常见问题:知识库如何更新?知识库支持增量更新,无需重建全量索引。知识库的检索延迟如何优化?"),
]
def build_test_kb(tmpdir: Path) -> Path:
kb = tmpdir / "knowledge_base"
kb.mkdir()
sub = kb / "docs"
sub.mkdir()
for name, content in SAMPLE_DOCS:
(sub / name).write_text(content, encoding="utf-8")
return kb
with tempfile.TemporaryDirectory() as tmpdir:
kb = build_test_kb(Path(tmpdir))
print("知识库目录:", kb)
for path in sorted(kb.rglob("*")):
print("-", path.relative_to(kb))
Step 8:用 main 把扫描、展示、报告生成串成 CLI 闭环
痛点与机制:
main() 是完整脚本入口:解析命令行参数,创建临时知识库,扫描词频,必要时写出 JSON/CSV,再验证 JSON 能读回来。新手只要会改 --mode scan/report,就能控制整个工具。
核心源码(逐字来自文末完整源码):
def main() -> None:
    """Command-line entry point.

    Builds the sample knowledge base in a temporary directory, scans it and
    prints the frequency table.  With ``--mode report`` it also writes the
    JSON/CSV reports and reads the JSON back to confirm the write.
    """
    cli = argparse.ArgumentParser(description="知识库文件扫描器")
    cli.add_argument(
        "--mode",
        choices=["scan", "report"],
        default="scan",
        help="scan=仅扫描打印, report=同时写入JSON/CSV",
    )
    options = cli.parse_args()

    with tempfile.TemporaryDirectory() as tmpdir:
        workspace = Path(tmpdir)
        kb_dir = build_test_kb(workspace)
        print(f"测试知识库路径: {kb_dir}")

        freq = scan_kb(kb_dir)
        print_table(freq)

        if options.mode == "report":
            report_dir = workspace / "output"
            report_dir.mkdir()
            write_report(freq, report_dir)
            # Read the JSON back to verify the write.
            report_text = (report_dir / "report.json").read_text(encoding="utf-8")
            loaded = json.loads(report_text)
            print(f"\n验证 JSON 读回: top1 = {list(loaded['top_words'].items())[0]}")

    # Printed after the with-block, i.e. once the temporary directory is gone.
    print("\n[完成] 临时目录已自动清理,零副作用。")
可运行演示(补齐 Mock 数据与 print 反馈):
import argparse
import csv
import json
import re
import tempfile
from collections import Counter
from pathlib import Path
class KBError(Exception):
"""知识库基础异常"""
class FileParseError(KBError):
def __init__(self, path: str, reason: str) -> None:
super().__init__(f"解析失败 [{path}]: {reason}")
self.path = path
def tokenize(text: str) -> list[str]:
"""简单分词:提取长度≥2的中英文词"""
return re.findall(r"[a-zA-Z]{2,}|[\u4e00-\u9fff]{2,}", text)
def read_safe(path: Path) -> str:
try:
return path.read_text(encoding="utf-8")
except UnicodeDecodeError as e:
raise FileParseError(str(path), f"编码错误: {e}") from e
except PermissionError as e:
raise FileParseError(str(path), f"无权限: {e}") from e
def scan_kb(kb_dir: Path) -> dict[str, int]:
"""扫描目录下所有 .txt/.md 文件,返回全局词频"""
counter: Counter[str] = Counter()
files = list(kb_dir.rglob("*.txt")) + list(kb_dir.rglob("*.md"))
print(f"发现文件: {len(files)} 个")
for f in files:
try:
text = read_safe(f)
words = tokenize(text)
counter.update(words)
except FileParseError as e:
print(f" [WARN] {e}")
return dict(counter.most_common(20))
def print_table(freq: dict[str, int]) -> None:
print(f"\n{'词语':<16} {'频次':>6}")
print("─" * 24)
for word, cnt in freq.items():
print(f"{word:<16} {cnt:>6}")
def write_report(freq: dict[str, int], out_dir: Path) -> None:
# JSON
json_path = out_dir / "report.json"
with open(json_path, "w", encoding="utf-8") as f:
json.dump({"top_words": freq}, f, ensure_ascii=False, indent=2)
print(f"JSON 报告: {json_path}")
# CSV
csv_path = out_dir / "report.csv"
rows = [{"word": w, "count": c} for w, c in freq.items()]
with open(csv_path, "w", newline="", encoding="utf-8") as f:
writer = csv.DictWriter(f, fieldnames=["word", "count"])
writer.writeheader()
writer.writerows(rows)
print(f"CSV 报告: {csv_path}")
SAMPLE_DOCS: list[tuple[str, str]] = [
("intro.md", "# 知识库简介\n知识库是AI系统的核心组件,用于存储和检索知识。AI系统依赖知识库提供准确答案。"),
("retrieval.txt", "向量检索是知识库的关键技术。embedding向量化后存入向量数据库,检索时计算余弦相似度。"),
("pipeline.md", "AI pipeline包括数据清洗、embedding生成、向量存储、语义检索四个阶段。数据清洗是pipeline的第一步。"),
("faq.txt", "常见问题:知识库如何更新?知识库支持增量更新,无需重建全量索引。知识库的检索延迟如何优化?"),
]
def build_test_kb(tmpdir: Path) -> Path:
kb = tmpdir / "knowledge_base"
kb.mkdir()
sub = kb / "docs"
sub.mkdir()
for name, content in SAMPLE_DOCS:
(sub / name).write_text(content, encoding="utf-8")
return kb
# Step 8:main 是总控台,--mode 决定只扫描还是生成报告。
def main() -> None:
parser = argparse.ArgumentParser(description="知识库文件扫描器")
parser.add_argument("--mode", choices=["scan", "report"], default="scan",
help="scan=仅扫描打印, report=同时写入JSON/CSV")
args = parser.parse_args()
with tempfile.TemporaryDirectory() as tmpdir:
tmp = Path(tmpdir)
kb_dir = build_test_kb(tmp)
print(f"测试知识库路径: {kb_dir}")
freq = scan_kb(kb_dir)
print_table(freq)
if args.mode == "report":
out = tmp / "output"
out.mkdir()
write_report(freq, out)
# 验证写入
loaded = json.loads((out / "report.json").read_text(encoding="utf-8"))
print(f"\n验证 JSON 读回: top1 = {list(loaded['top_words'].items())[0]}")
print("\n[完成] 临时目录已自动清理,零副作用。")
import sys
sys.argv = ["prog", "--mode", "report"]
main()
极客实战:完整源码与运行
现在,把上面的积木拼起来,将以下完整代码放进你的编辑器,运行它。先看整体闭环,再回头逐段改参数,你会更容易建立工程直觉。
#!/usr/bin/env python3
"""
知识库文件扫描器
用法:
python3 09-python-io.py --mode scan # 扫描并统计词频
python3 09-python-io.py --mode report # 生成 JSON + CSV 报告
"""
import argparse
import csv
import json
import re
import tempfile
from collections import Counter
from pathlib import Path
# ── 自定义异常 ────────────────────────────────────────────────
class KBError(Exception):
    """Base class for every error raised by the knowledge-base tooling."""


class FileParseError(KBError):
    """Raised when a knowledge-base file cannot be read or decoded.

    Attributes:
        path: Location of the offending file, as passed by the caller.
        reason: Short description of what went wrong.
    """

    def __init__(self, path: str, reason: str) -> None:
        super().__init__(f"解析失败 [{path}]: {reason}")
        self.path = path
        # Keep the reason too, so handlers need not parse the message.
        self.reason = reason

    def __reduce__(self) -> tuple[type, tuple[str, str]]:
        # By default exceptions are copied/pickled as type(self)(*self.args),
        # but args holds only the formatted message.  Rebuild from the two
        # original arguments so copy.copy() and pickle round-trip instead of
        # failing with a TypeError.
        return (type(self), (self.path, self.reason))
# ── 核心逻辑 ──────────────────────────────────────────────────
def tokenize(text: str) -> list[str]:
    """Split *text* into countable tokens.

    A token is a run of two or more ASCII letters, or a run of two or more
    characters in the range U+4E00..U+9FFF.  Shorter runs, digits and
    punctuation are dropped.
    """
    token_pattern = re.compile(r"[a-zA-Z]{2,}|[\u4e00-\u9fff]{2,}")
    return [match.group() for match in token_pattern.finditer(text)]
def read_safe(path: Path) -> str:
    """Return the UTF-8 text stored at *path*.

    Decoding and permission failures are re-raised as ``FileParseError``
    chained to the original exception; any other error (for example
    ``FileNotFoundError``) propagates unchanged.
    """
    try:
        content = path.read_text(encoding="utf-8")
    except UnicodeDecodeError as exc:
        raise FileParseError(str(path), f"编码错误: {exc}") from exc
    except PermissionError as exc:
        raise FileParseError(str(path), f"无权限: {exc}") from exc
    return content
def scan_kb(kb_dir: Path) -> dict[str, int]:
    """Count tokens across every ``.txt``/``.md`` file below *kb_dir*.

    Args:
        kb_dir: Root directory; searched recursively.

    Returns:
        The 20 most frequent tokens mapped to their counts, most frequent
        first.

    A file that cannot be read or decoded is reported with a ``[WARN]`` line
    and skipped; it does not abort the scan.
    """
    counter: Counter[str] = Counter()
    # rglob() also yields directories whose names match (e.g. "notes.md/"),
    # so keep regular files only.  Sorting makes the scan order, and with it
    # the tie-breaking in most_common(), reproducible.
    files = sorted(
        p
        for pattern in ("*.txt", "*.md")
        for p in kb_dir.rglob(pattern)
        if p.is_file()
    )
    print(f"发现文件: {len(files)} 个")
    for f in files:
        try:
            counter.update(tokenize(read_safe(f)))
        except FileParseError as e:
            print(f" [WARN] {e}")
        except OSError as e:
            # read_safe() translates only decode/permission errors; a file
            # that vanished or hit an I/O error must not stop the scan.
            print(f" [WARN] 读取失败 [{f}]: {e}")
    return dict(counter.most_common(20))
def print_table(freq: dict[str, int]) -> None:
    """Print *freq* as a two-column table: token on the left, count on the right.

    Columns are padded by terminal display width rather than ``len()``.
    East Asian wide/full-width characters occupy two cells, so ``len()``-based
    alignment pushes rows with Chinese tokens out of line with ASCII ones.
    """
    # Local import: nothing else here needs unicodedata.
    from unicodedata import east_asian_width

    def pad(text: str, width: int, *, right: bool = False) -> str:
        """Pad *text* with spaces up to *width* terminal cells."""
        cells = sum(2 if east_asian_width(ch) in ("W", "F") else 1 for ch in text)
        fill = " " * max(width - cells, 0)
        return fill + text if right else text + fill

    print(f"\n{pad('词语', 16)} {pad('频次', 6, right=True)}")
    print("─" * 24)
    for word, cnt in freq.items():
        print(f"{pad(word, 16)} {cnt:>6}")
def write_report(freq: dict[str, int], out_dir: Path) -> None:
    """Write *freq* to ``report.json`` and ``report.csv`` inside *out_dir*.

    Args:
        freq: Token -> count mapping; its iteration order is the row order.
        out_dir: Destination directory, created (with parents) if missing.

    The JSON file holds ``{"top_words": freq}``; the CSV has a ``word,count``
    header followed by one row per token.  Both paths are printed.
    """
    # Callers should not have to pre-create the directory; exist_ok keeps
    # the call harmless when they already did.
    out_dir.mkdir(parents=True, exist_ok=True)

    # JSON
    json_path = out_dir / "report.json"
    with open(json_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps Chinese tokens readable in the file.
        json.dump({"top_words": freq}, f, ensure_ascii=False, indent=2)
    print(f"JSON 报告: {json_path}")

    # CSV
    csv_path = out_dir / "report.csv"
    rows = [{"word": w, "count": c} for w, c in freq.items()]
    # newline="" hands line-ending control to the csv module.
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["word", "count"])
        writer.writeheader()
        writer.writerows(rows)
    print(f"CSV 报告: {csv_path}")
# ── 测试数据 ──────────────────────────────────────────────────
# (file name, file content) pairs used to populate the sample knowledge base.
SAMPLE_DOCS: list[tuple[str, str]] = [
    ("intro.md", "# 知识库简介\n知识库是AI系统的核心组件,用于存储和检索知识。AI系统依赖知识库提供准确答案。"),
    ("retrieval.txt", "向量检索是知识库的关键技术。embedding向量化后存入向量数据库,检索时计算余弦相似度。"),
    ("pipeline.md", "AI pipeline包括数据清洗、embedding生成、向量存储、语义检索四个阶段。数据清洗是pipeline的第一步。"),
    ("faq.txt", "常见问题:知识库如何更新?知识库支持增量更新,无需重建全量索引。知识库的检索延迟如何优化?"),
]


def build_test_kb(tmpdir: Path) -> Path:
    """Create the sample knowledge base under *tmpdir* and return its root.

    Layout: ``<tmpdir>/knowledge_base/docs/<name>`` for every
    ``(name, content)`` pair in ``SAMPLE_DOCS``, each written as UTF-8.
    """
    kb_root = tmpdir / "knowledge_base"
    docs_dir = kb_root / "docs"
    kb_root.mkdir()
    docs_dir.mkdir()
    for filename, body in SAMPLE_DOCS:
        docs_dir.joinpath(filename).write_text(body, encoding="utf-8")
    return kb_root
# ── 入口 ──────────────────────────────────────────────────────
def main() -> None:
    """Command-line entry point.

    Builds the sample knowledge base in a temporary directory, scans it and
    prints the frequency table.  With ``--mode report`` it also writes the
    JSON/CSV reports and reads the JSON back to confirm the write.
    """
    cli = argparse.ArgumentParser(description="知识库文件扫描器")
    cli.add_argument(
        "--mode",
        choices=["scan", "report"],
        default="scan",
        help="scan=仅扫描打印, report=同时写入JSON/CSV",
    )
    options = cli.parse_args()

    with tempfile.TemporaryDirectory() as tmpdir:
        workspace = Path(tmpdir)
        kb_dir = build_test_kb(workspace)
        print(f"测试知识库路径: {kb_dir}")

        freq = scan_kb(kb_dir)
        print_table(freq)

        if options.mode == "report":
            report_dir = workspace / "output"
            report_dir.mkdir()
            write_report(freq, report_dir)
            # Read the JSON back to verify the write.
            report_text = (report_dir / "report.json").read_text(encoding="utf-8")
            loaded = json.loads(report_text)
            print(f"\n验证 JSON 读回: top1 = {list(loaded['top_words'].items())[0]}")

    # Printed after the with-block, i.e. once the temporary directory is gone.
    print("\n[完成] 临时目录已自动清理,零副作用。")


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
运行示例:
python3 09-python-io.py --mode scan
python3 09-python-io.py --mode report
关键点速查
| 场景 | 推荐方式 |
|---|---|
| 路径操作 | pathlib.Path |
| 文件读写 | with open(...) as f |
| 编码安全 | encoding="utf-8" 显式指定 |
| 结构化数据 | json / csv 标准库 |
| 测试隔离 | tempfile.TemporaryDirectory() |
| 异常链 | raise NewError(...) from original |
⏱ NexDo Time · 5 分钟微操
- 把 SAMPLE_DOCS 里的一个文件内容改成乱码字节(b"\xff\xfe",字节内容要用 Path.write_bytes 写入),观察 FileParseError 是否被正确捕获并打印 [WARN]。
- 在 scan_kb 中新增对 .json 文件的支持,用 json.load 读取后把所有字符串值拼接再分词。
- 修改 print_table,在频次列右侧加一列文本进度条(█ 字符,最高频=20格)。
Don’t wait for next time, do it in the next moment.