不会吧,不会吧,都已经2024了,还有人在用os.path.join?不知道pathlib.Path有多香吗?

兵马未动,粮草先行:Life is short, enjoy yourself~

今天看到一段类似这样的代码:

import os
import shutil

work_dir = "E:\\代码\\hello world"
data_dir = os.path.join(os.path.join(work_dir, "datasets"), "xmls")  # "datasets/xmls" under work_dir
# Move every .xml file that has no line starting with "<" into a sibling "broken_<dir>" folder.
for name in os.listdir(data_dir):
    if not name.endswith(".xml"):
        continue
    file = os.path.join(data_dir, name)
    with open(file, "r", encoding="utf-8") as f:
        lines = f.readlines()
    if not any(i.strip().startswith("<") for i in lines):  # no tag-like line anywhere in the file
        category = os.path.split(data_dir)[-1]  # last component of data_dir
        folder = "broken_" + category
        out_dir = os.path.join(os.path.dirname(data_dir), folder)  # sibling of data_dir
        os.makedirs(out_dir, exist_ok=True)
        new_path = os.path.join(out_dir, name + ".txt")  # keeps the original name and appends ".txt"
        shutil.move(file, new_path)
        print(f"Move {file} to {new_path}")

忍不住吐槽了一下,并强力推荐使用自Python 3.4起就已加入标准库的pathlib。

上例中原本20行的代码,改用pathlib的话,只需12行:

from pathlib import Path

work_dir = "E:\\代码\\hello world"
data_dir = Path(work_dir, "datasets", "xmls")  # Path() joins all segments in a single call
# Move every .xml file that has no line starting with "<" into a sibling "broken_<dir>" folder.
for p in data_dir.glob("*.xml"):
    lines = p.read_text(encoding="utf-8").splitlines()
    if not any(i.strip().startswith("<") for i in lines):  # no tag-like line anywhere in the file
        out_dir = data_dir.parent / f"broken_{data_dir.name}"  # sibling of data_dir
        out_dir.mkdir(parents=True, exist_ok=True)
        new_path = p.rename(out_dir / f"{p.name}.txt")  # rename() returns the new Path only on Python 3.8+
        print(f"Move {p} to {new_path}")

=========================================================================

os vs pathlib

1. 获取上级目录、上上级目录

import os

current_dir = os.getcwd() # current working directory, as a str
assert isinstance(current_dir, str)
parent_dir = os.path.dirname(current_dir) # parent directory
grandparent_dir = os.path.dirname(os.path.dirname(current_dir)) # grandparent: dirname() has to be nested

# --------------------------------------------------------------
from pathlib import Path
current_dir = Path.cwd() # current working directory, as a Path
assert isinstance(current_dir, Path)
parent_dir = current_dir.parent # parent directory
grandparent_dir = current_dir.parent.parent # grandparent: just chain .parent

2. 遍历某个路径下的某类文件

import os

target_dir = "D:\\Documents\\"
suffix = ".py"

# Single level: direct children of target_dir only
for name in os.listdir(target_dir):
    assert os.sep not in name # listdir() yields bare names without any directory part
    if not name.endswith(suffix):
        continue
    file_path = os.path.join(target_dir, name)
    assert os.path.exists(file_path)


# Recursive: every file below target_dir
for root, dirs, files in os.walk(target_dir):
    for name in files:
        if not name.endswith(suffix):
            continue
        file_path = os.path.join(root, name)
        assert os.path.isfile(file_path)

# ------------------------------------------------
from pathlib import Path

# Single level: direct children of target_dir only
for file in Path(target_dir).glob(f'*{suffix}'):
    assert isinstance(file, Path) and os.sep in str(file) # glob() yields Path objects that include the directory
    name = file.name # bare file name
    assert isinstance(name, str) and os.sep not in name
    assert file.exists() # existence check

# Recursive: target_dir and all of its sub-directories
for file in Path(target_dir).rglob(f'*{suffix}'):
    if not file.is_file(): # rglob() also yields directories whose name matches, e.g. a folder called xxx.py
        continue
    # NOTE(review): glob matching is case-insensitive on Windows, so a name such as
    # "A.PY" would presumably be yielded and fail this case-sensitive check - confirm.
    assert file.name.endswith(suffix)

3. 路径拼接

import os

# NOTE: the literals below use "\\" as the separator, so these assertions are
# Windows-specific; on POSIX a backslash is not treated as a path separator.
target = "E:\\python scripts\\simple\\print_heart.py"
current_dir = "E:\\Downloads"

joined = os.path.join(os.path.dirname(current_dir), 'python scripts', 'simple', 'print_heart.py')
joined2 = os.path.join(os.path.dirname(current_dir), 'python scripts\\simple\\print_heart.py')
assert joined == joined2 == target

# ---------------------------------
from pathlib import Path

target = Path("E:\\python scripts\\simple\\print_heart.py")
current_dir = Path("E:\\Downloads")

# The "/" operator appends segments; a segment may itself contain separators.
joined = current_dir.parent / 'python scripts' / 'simple' / 'print_heart.py'
joined2 = current_dir.parent / 'python scripts\\simple\\print_heart.py'
assert joined == joined2 == target

# pathlib also supports joinpath() and the Path('a', 'b', 'c') constructor form
joined3 = current_dir.parent.joinpath('python scripts\\simple\\print_heart.py')
joined4 = current_dir.parent.joinpath('python scripts/simple/print_heart.py')
joined5 = current_dir.parent.joinpath('python scripts').joinpath('simple').joinpath('print_heart.py')

joined6 = Path(current_dir.parent, 'python scripts', 'simple', 'print_heart.py')
joined7 = Path(str(current_dir.parent), 'python scripts', 'simple', 'print_heart.py')
joined8 = Path(current_dir.parent, 'python scripts/simple/print_heart.py')
joined9 = Path(current_dir.parent, 'python scripts\\simple\\print_heart.py')

# Every spelling above produces the same path.
for i in (joined3, joined4, joined5, joined6, joined7, joined8, joined9):
    assert i == target

一些比较特殊的情况:

4. pathlib.Path的一些常用属性和方法

# Quick reference for common Path attributes and methods. Read each line as an
# independent example rather than as a script to run from top to bottom.
file = Path('E:/user/project/data/sample.txt') # on Windows both '\\' and '/' are accepted
print(file.name)      # prints: sample.txt
print(file.suffix)    # prints: .txt
print(file.stem)      # prints: sample
print(file.parent)    # prints: E:\user\project\data (on Windows)
print(file.exists())  # whether the path exists
print(file.is_file()) # whether it is a regular file
print(file.is_dir())  # whether it is a directory
print(file.stat().st_size) # file size in bytes, without loading the content into memory

file.read_bytes()     # read binary content, returns bytes
file.read_text(encoding='utf-8')  # read text content, returns str; pass encoding instead of relying on the locale default
file.write_bytes(b'') # write binary content
file.write_text('xx', encoding='utf-8') # write text content
file.touch()          # create an empty file
file.parent.mkdir(parents=True, exist_ok=True) # create the directory, including any missing ancestors
file.parent.glob('*') # iterate the parent directory, like os.listdir(os.path.dirname(file))
file.parent.rglob('*')# recursively iterate files and folders in the parent directory and all sub-directories
file.unlink()         # delete the file
file.parent.rmdir()   # delete the parent directory (raises if it is not empty)
file.with_suffix('.py') # swap the suffix, returns Path('E:/user/project/data/sample.py')
file.with_name('sample.py') # swap the file name, returns Path('E:/user/project/data/sample.py')

更多示例见官方文档:pathlib — Object-oriented filesystem paths(https://docs.python.org/3/library/pathlib.html)

打个广告:Ruff很好用,搭配mypy可以减少很多无脑错误;由于经常组合使用,我还特意封装了一个便捷小工具fast-dev-cli。

import argparse
import os
import re
from multiprocessing import Pool, freeze_support

import fitz


def extract_name(pdf_path):
    """Return the first name found after a "姓名" label in a PDF, or None.

    Pages are scanned in order and the search stops at the first match.
    """
    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            text = page.get_text()
            match = re.search(r'姓名[::]?\s*\n*([^\s\n]+)', text)
            if match:
                return match.group(1).strip()
        return None
    finally:
        # Close the document even when text extraction raises.
        doc.close()


def process_single_pdf(args):
    """Cover every occurrence of the extracted name in one PDF and save a copy.

    Takes a single ``(input_path, output_dir)`` tuple so it can be handed
    directly to ``Pool.map``. The result is saved under the same file name
    inside ``output_dir``. Files without a detectable name are skipped with a
    printed warning.
    """
    input_path, output_dir = args
    file_name = os.path.basename(input_path)
    output_path = os.path.join(output_dir, file_name)

    target_name = extract_name(input_path)
    if not target_name:
        print(f"警告:{file_name} 未找到姓名信息")
        return

    doc = fitz.open(input_path)
    try:
        for page in doc:
            # NOTE(review): draw_rect only paints a white box on top of the
            # text; the underlying text presumably remains extractable. If real
            # removal is required, consider PyMuPDF redaction annotations
            # (add_redact_annot / apply_redactions) - confirm before relying on this.
            for rect in page.search_for(target_name):
                page.draw_rect(rect, color=(1, 1, 1), fill=(1, 1, 1), overlay=True)
        doc.save(output_path)
    finally:
        doc.close()


def batch_process(input_dir, workers=4):
    """Process every ``.pdf`` file directly inside ``input_dir`` in parallel.

    Results are written to ``<input_dir>/processed_pdfs``, which is created if
    missing. ``workers`` is the size of the process pool.
    """
    output_dir = os.path.join(input_dir, "processed_pdfs")
    os.makedirs(output_dir, exist_ok=True)

    # Extension match is case-insensitive; sub-directories are not searched.
    pdf_files = [
        os.path.join(input_dir, f)
        for f in os.listdir(input_dir)
        if f.lower().endswith(".pdf")
    ]

    # Pool.map passes one argument per task, so pack both values into a tuple.
    task_args = [(f, output_dir) for f in pdf_files]

    with Pool(processes=workers) as pool:
        pool.map(process_single_pdf, task_args)


if __name__ == "__main__":
    freeze_support()  # keeps multiprocessing working in a frozen (packaged) executable

    parser = argparse.ArgumentParser(description="批量处理PDF文件,擦除敏感姓名信息")
    parser.add_argument("--input_dir", required=True, help="输入目录路径(包含待处理PDF)")
    parser.add_argument("--workers", type=int, default=4, help="并行工作进程数(默认4)")
    args = parser.parse_args()

    # Forward --workers; previously it was parsed but silently ignored.
    batch_process(args.input_dir, workers=args.workers)
03-29
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值