兵马未动,粮草先行:Life is short, enjoy yourself~
今天看到一段类似这样的代码:
import os
import shutil

# Quarantine "broken" XML files: every *.xml directly under
# <work_dir>/datasets/xmls that has no line starting with "<" is moved into a
# sibling "broken_xmls" folder and gets an extra ".txt" suffix.
work_dir = "E:\\代码\\hello world"
data_dir = os.path.join(os.path.join(work_dir, "datasets"), "xmls")
for name in os.listdir(data_dir):
    if not name.endswith(".xml"):
        continue
    file = os.path.join(data_dir, name)
    with open(file, "r", encoding="utf-8") as f:
        lines = f.readlines()
    # The check and the move run after the `with` block, i.e. once the file
    # handle has been closed.
    if not any(i.strip().startswith("<") for i in lines):
        category = os.path.split(data_dir)[-1]  # last path component: "xmls"
        folder = "broken_" + category
        out_dir = os.path.join(os.path.dirname(data_dir), folder)
        os.makedirs(out_dir, exist_ok=True)
        new_path = os.path.join(out_dir, name + ".txt")
        shutil.move(file, new_path)
        print(f"Move {file} to {new_path}")
忍不住吐槽了一下,并强力推荐使用自 Python 3.4 起加入标准库的 pathlib。
上例中原本20行的代码,改用pathlib的话,只需12行:
from pathlib import Path

# Move every *.xml under <work_dir>/datasets/xmls that has no line starting
# with "<" into a sibling "broken_xmls" folder, appending ".txt" to its name.
work_dir = "E:\\代码\\hello world"
data_dir = Path(work_dir, "datasets", "xmls")
for p in data_dir.glob("*.xml"):
    lines = p.read_text(encoding="utf-8").splitlines()
    if not any(i.strip().startswith("<") for i in lines):
        out_dir = data_dir.parent / f"broken_{data_dir.name}"
        out_dir.mkdir(parents=True, exist_ok=True)
        # Path.rename() returns the new Path on Python 3.8+ (None before that).
        new_path = p.rename(out_dir / f"{p.name}.txt")
        print(f"Move {p} to {new_path}")
=========================================================================
os vs pathlib:
1. 获取上级目录、上上级目录
import os

# Working directory; os hands it back as a plain string.
current_dir = os.getcwd()
assert isinstance(current_dir, str)
# One level up.
parent_dir = os.path.dirname(current_dir)
# Two levels up: dirname has to be applied a second time, to the parent.
grandparent_dir = os.path.dirname(parent_dir)
# --------------------------------------------------------------
from pathlib import Path

# Working directory as a Path object rather than a string.
current_dir = Path.cwd()
assert isinstance(current_dir, Path)
# One level up.
parent_dir = current_dir.parent
# Two levels up: keep following .parent.
grandparent_dir = parent_dir.parent
2. 遍历某个路径下的某类文件
import os

target_dir = "D:\\Documents\\"
suffix = ".py"
# Single level: os.listdir yields bare names, so filtering and re-joining with
# the directory are done by hand.
for name in os.listdir(target_dir):
    assert os.sep not in name  # a bare file name, no directory part
    if not name.endswith(suffix):
        continue
    file_path = os.path.join(target_dir, name)
    assert os.path.exists(file_path)
# Recursive: os.walk yields a (root, dirs, files) triple per directory.
for root, dirs, files in os.walk(target_dir):
    for name in files:
        if not name.endswith(suffix):
            continue
        file_path = os.path.join(root, name)
        assert os.path.isfile(file_path)
# ------------------------------------------------
from pathlib import Path

# Single level: glob yields Path objects that already carry the directory.
for file in Path(target_dir).glob(f'*{suffix}'):
    assert isinstance(file, Path) and os.sep in str(file)  # includes the directory part
    name = file.name  # final path component only
    assert isinstance(name, str) and os.sep not in name
    assert file.exists()
# Recursive: rglob descends into every subdirectory.
for file in Path(target_dir).rglob(f'*{suffix}'):
    if not file.is_file():  # a directory named xxx.py matches the pattern too
        continue
    # Glob matching is case-insensitive on Windows (e.g. "FOO.PY" matches
    # "*.py"), so compare the suffix case-insensitively as well.
    assert file.name.lower().endswith(suffix.lower())
3. 路径拼接
import os
# NOTE(review): the literals below use Windows path syntax (drive letter,
# backslash separators); the assertion presumably assumes os.path follows
# Windows rules — confirm the intended platform.
target = "E:\\python scripts\\simple\\print_heart.py"
current_dir = "E:\\Downloads"
# Spelling 1: each remaining component is passed as its own argument.
joined = os.path.join(os.path.dirname(current_dir), 'python scripts', 'simple', 'print_heart.py')
# Spelling 2: the remaining components are passed as one backslash-joined string.
joined2 = os.path.join(os.path.dirname(current_dir), 'python scripts\\simple\\print_heart.py')
# Both spellings are expected to yield the same string as `target`.
assert joined == joined2 == target
# ---------------------------------
from pathlib import Path
# NOTE(review): the literals below use Windows path syntax; the equality checks
# presumably assume pathlib parses them with Windows rules — confirm platform.
target = Path("E:\\python scripts\\simple\\print_heart.py")
current_dir = Path("E:\\Downloads")
# The "/" operator appends one component at a time ...
joined = current_dir.parent / 'python scripts' / 'simple' / 'print_heart.py'
# ... or a string that itself contains several components.
joined2 = current_dir.parent / 'python scripts\\simple\\print_heart.py'
assert joined == joined2 == target
# pathlib also supports joinpath() and the Path('a', 'b', 'c') constructor form
joined3 = current_dir.parent.joinpath('python scripts\\simple\\print_heart.py')
joined4 = current_dir.parent.joinpath('python scripts/simple/print_heart.py')
joined5 = current_dir.parent.joinpath('python scripts').joinpath('simple').joinpath('print_heart.py')
joined6 = Path(current_dir.parent, 'python scripts', 'simple', 'print_heart.py')
joined7 = Path(str(current_dir.parent), 'python scripts', 'simple', 'print_heart.py')
joined8 = Path(current_dir.parent, 'python scripts/simple/print_heart.py')
joined9 = Path(current_dir.parent, 'python scripts\\simple\\print_heart.py')
# Each alternative spelling is expected to compare equal to `target`.
for i in (joined3, joined4, joined5, joined6, joined7, joined8, joined9):
    assert i == target
一些比较特殊的情况:
4. pathlib.Path的一些常用属性和方法
# Quick reference of frequently used Path attributes and methods.
# NOTE(review): presumably a list of independent one-liners rather than a
# script to run top to bottom (e.g. mkdir() directly after touch() on the same
# path would fail) — confirm intent.
file = Path('E:/user/project/data/sample.txt')  # on Windows both '\\' and '/' work as separators
print(file.name)  # prints: sample.txt
print(file.suffix)  # prints: .txt
print(file.stem)  # prints: sample
print(file.parent)  # prints: E:\user\project\data (on Windows)
print(file.exists())  # whether the path exists
print(file.is_file())  # whether the path is a regular file
print(file.is_dir())  # whether the path is a directory
print(file.stat().st_size)  # file size in bytes, taken from metadata instead of reading the whole file
file.read_bytes()  # read the binary content, returns bytes
file.read_text()  # read the text content, returns str (pass encoding=... to be explicit)
file.write_bytes(b'')  # write binary data
file.write_text('xx')  # write text
file.touch()  # create an empty file
file.mkdir()  # create a directory at this path
file.parent.glob('*')  # iterate over the parent directory, roughly os.listdir(os.path.dirname(file))
file.parent.rglob('*')  # recursively iterate over files and folders under the parent directory
file.unlink()  # delete the file
file.parent.rmdir()  # delete the parent directory (raises an error if it is not empty)
file.with_suffix('.py')  # swap the suffix, returns Path('E:/user/project/data/sample.py')
file.with_name('sample.py')  # swap the file name, returns Path('E:/user/project/data/sample.py')
更多示例见官方文档:pathlib — Object-oriented filesystem paths(https://docs.python.org/3/library/pathlib.html)
打个广告:Ruff很好用,搭配mypy可以减少很多无脑错误,由于经常组合使用我还特意封装了一个便捷小工具fast-dev-cli~