我主要监控 C:\Users 下面的用户数据:磁盘空间变少,多半是它增大了。怎么办?建立文件快照吧。
脚本会自动扫描大于 30MB 的文件并生成快照;下次生成快照时会自动与旧快照对比,列出新增、删除、修改了哪些文件。
filesnapshot.py 文件如下:
import os
import json
import hashlib
import pandas as pd
from datetime import datetime
from pathlib import Path
class FileSnapshot:
    """Capture and compare snapshots of the files under a directory tree.

    A snapshot is a JSON document recording size, timestamps and an MD5
    digest for every file at or above a size threshold.  Two snapshots can
    be diffed to list the files that were added, deleted or modified.
    """

    def __init__(self, base_path):
        """Remember the directory to scan.

        Args:
            base_path: Root directory of the tree (str or path-like).
        """
        self.base_path = Path(base_path)
        # Default baseline file name; no method of this class reads it.
        self.snapshot_file = "initial_snapshot.json"

    def load_snapshot(self, snapshot_file):
        """Load a snapshot JSON file and return its parsed content.

        Args:
            snapshot_file: Path of the snapshot file (str or os.PathLike).

        Returns:
            The decoded JSON document.

        Raises:
            TypeError: If snapshot_file is neither a str nor path-like.
            FileNotFoundError: If the file does not exist.
        """
        # Accept pathlib.Path as well as str; the class itself works with Path.
        if not isinstance(snapshot_file, (str, os.PathLike)):
            raise TypeError(f"快照文件参数必须是字符串路径,但收到 {type(snapshot_file)}")
        if not os.path.exists(snapshot_file):
            raise FileNotFoundError(f"快照文件不存在: {snapshot_file}")
        with open(snapshot_file, 'r', encoding='utf-8') as f:
            return json.load(f)

    def calculate_file_hash(self, file_path):
        """Return the MD5 hex digest of a file, used to detect content changes.

        Args:
            file_path: Path of the file to hash.

        Returns:
            The hex digest, or the string "access_denied" when the file
            cannot be opened or read.
        """
        hash_md5 = hashlib.md5()
        try:
            with open(file_path, "rb") as f:
                # 1 MiB reads: far fewer read calls than 4 KiB chunks on
                # large files, and the resulting digest is identical.
                for chunk in iter(lambda: f.read(1024 * 1024), b""):
                    hash_md5.update(chunk)
            return hash_md5.hexdigest()
        except OSError:
            # PermissionError and IOError are both OSError.
            return "access_denied"

    def create_snapshot(self, snapshot_name=None, min_size_mb=0):
        """Scan base_path recursively and write a snapshot JSON file.

        Args:
            snapshot_name: Snapshot name without extension; defaults to a
                "snapshot_YYYYmmdd_HHMMSS" timestamp.
            min_size_mb: Size threshold in MB; only files whose size is at
                least this value are recorded.

        Returns:
            The name of the written file, "<snapshot_name>.json", which is
            created relative to the current working directory.
        """
        if snapshot_name is None:
            snapshot_name = datetime.now().strftime("snapshot_%Y%m%d_%H%M%S")
        # Convert MB to bytes.
        min_size_bytes = min_size_mb * 1024 * 1024
        snapshot_data = {
            "timestamp": datetime.now().isoformat(),
            "base_path": str(self.base_path),
            "min_size_mb": min_size_mb,
            "files": {},
        }
        print(f"正在创建快照: {snapshot_name} (仅包含大于 {min_size_mb}MB 的文件)")
        file_count = 0
        included_count = 0
        total_size_bytes = 0
        for file_path in self.base_path.rglob('*'):
            try:
                # is_file() may itself raise (e.g. PermissionError), so it is
                # inside the try: one unreadable entry must not abort the scan.
                if not file_path.is_file():
                    continue
                # One stat() call so size/mtime/ctime describe the same state.
                file_stat = file_path.stat()
                file_size = file_stat.st_size
                file_count += 1
                if file_size >= min_size_bytes:
                    rel_path = str(file_path.relative_to(self.base_path))
                    snapshot_data["files"][rel_path] = {
                        "size": file_size,
                        "size_mb": round(file_size / (1024 * 1024), 2),
                        "mtime": file_stat.st_mtime,
                        "ctime": file_stat.st_ctime,
                        "hash": self.calculate_file_hash(file_path),
                    }
                    included_count += 1
                    total_size_bytes += file_size
                    if included_count % 100 == 0:
                        print(f"已扫描 {file_count} 个文件,已包含 {included_count} 个符合要求的文件...")
                elif file_count % 1000 == 0:
                    # Small files are counted but not stored in the snapshot.
                    print(f"已扫描 {file_count} 个文件,已包含 {included_count} 个符合要求的文件...")
            except OSError as e:
                # Record the failing path and keep scanning.
                rel_path = str(file_path.relative_to(self.base_path))
                snapshot_data["files"][rel_path] = {"error": str(e)}
        # Persist the snapshot.
        snapshot_filename = f"{snapshot_name}.json"
        with open(snapshot_filename, 'w', encoding='utf-8') as f:
            json.dump(snapshot_data, f, indent=2, ensure_ascii=False)
        # Summary statistics.
        total_size_mb = round(total_size_bytes / (1024 * 1024), 2)
        print(f"快照完成!扫描 {file_count} 个文件,包含 {included_count} 个大于 {min_size_mb}MB 的文件")
        print(f"总大小: {total_size_mb}MB,保存为: {snapshot_filename}")
        return snapshot_filename

    def compare_snapshots(self, old_snapshot_file, new_snapshot_file, output_format="excel"):
        """Diff two snapshot files and write a report.

        Args:
            old_snapshot_file: Path of the older snapshot.
            new_snapshot_file: Path of the newer snapshot.
            output_format: "excel" for an .xlsx report; any other value
                produces a plain-text report.

        Returns:
            A dict with the keys "added_files", "deleted_files",
            "modified_files" and "unchanged_files"; each value is a list of
            per-file dicts ordered by relative path.
        """
        print(f"正在比较快照: {old_snapshot_file} 和 {new_snapshot_file}")
        old_files = self.load_snapshot(old_snapshot_file)["files"]
        new_files = self.load_snapshot(new_snapshot_file)["files"]
        comparison_results = {
            "added_files": [],
            "deleted_files": [],
            "modified_files": [],
            "unchanged_files": [],
        }
        # Sorted so the report order is stable between runs (a bare set is not).
        for file_path in sorted(set(old_files) | set(new_files)):
            if file_path not in old_files:
                comparison_results["added_files"].append({
                    "file_path": file_path,
                    "size": new_files[file_path].get("size", 0),
                    "type": "新增",
                })
            elif file_path not in new_files:
                comparison_results["deleted_files"].append({
                    "file_path": file_path,
                    "size": old_files[file_path].get("size", 0),
                    "type": "删除",
                })
            else:
                old_file = old_files[file_path]
                new_file = new_files[file_path]
                # A differing hash means the content changed.
                if old_file.get("hash") != new_file.get("hash"):
                    comparison_results["modified_files"].append({
                        "file_path": file_path,
                        "old_size": old_file.get("size", 0),
                        "new_size": new_file.get("size", 0),
                        "size_change": new_file.get("size", 0) - old_file.get("size", 0),
                        "type": "修改",
                    })
                else:
                    comparison_results["unchanged_files"].append({
                        "file_path": file_path,
                        "size": new_file.get("size", 0),
                        "type": "未变化",
                    })
        self.generate_report(comparison_results, old_snapshot_file, new_snapshot_file, output_format)
        return comparison_results

    def generate_report(self, results, old_snapshot, new_snapshot, format_type="excel"):
        """Write the comparison result to a timestamped report file.

        Args:
            results: Dict as returned by compare_snapshots.
            old_snapshot: Old snapshot name, shown in the text report.
            new_snapshot: New snapshot name, shown in the text report.
            format_type: "excel" writes comparison_report_<ts>.xlsx with one
                sheet per non-empty category; any other value writes
                comparison_report_<ts>.txt.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        if format_type == "excel":
            report_file = f"comparison_report_{timestamp}.xlsx"
            with pd.ExcelWriter(report_file) as writer:
                wrote_sheet = False
                for change_type, files in results.items():
                    if files:  # only categories that have rows get a sheet
                        pd.DataFrame(files).to_excel(writer, sheet_name=change_type, index=False)
                        wrote_sheet = True
                if not wrote_sheet:
                    # A workbook with no sheets cannot be saved, so emit a
                    # placeholder sheet when every category is empty.
                    placeholder = pd.DataFrame({"说明": ["两个快照均未包含任何文件"]})
                    placeholder.to_excel(writer, sheet_name="no_files", index=False)
            print(f"Excel报告已生成: {report_file}")
        else:
            report_file = f"comparison_report_{timestamp}.txt"
            with open(report_file, 'w', encoding='utf-8') as f:
                f.write("文件系统差异分析报告\n")
                f.write(f"对比时间: {datetime.now()}\n")
                f.write(f"旧快照: {old_snapshot}\n")
                f.write(f"新快照: {new_snapshot}\n")
                f.write("=" * 50 + "\n\n")
                for change_type, files in results.items():
                    if not files:
                        continue
                    f.write(f"\n【{change_type}】 - 共 {len(files)} 个文件\n")
                    for file_info in files:
                        f.write(f" - {file_info['file_path']}\n")
                        if 'size' in file_info:
                            f.write(f" 大小: {file_info['size']} bytes\n")
                        if 'size_change' in file_info:
                            f.write(f" 大小变化: {file_info['size_change']} bytes\n")
            print(f"文本报告已生成: {report_file}")
# 格式化打印结果
def print_comparison_results(results):
    """Print a human-readable summary of a snapshot comparison to stdout.

    Args:
        results: Dict with the keys "added_files", "deleted_files",
            "modified_files" and "unchanged_files", each a list of per-file
            dicts.  Added and deleted entries need "file_path" and "size";
            modified entries need "file_path", "size_change", "old_size"
            and "new_size".

    Unchanged files are only counted in the summary, not listed one by one.
    """
    print("=" * 60)
    print("快照比较结果")
    print("=" * 60)

    print(f"\n新增文件 ({len(results['added_files'])}):")
    for entry in results["added_files"]:
        print(f" {entry['file_path']} - 大小: {entry['size']} 字节")

    print(f"\n删除文件 ({len(results['deleted_files'])}):")
    for entry in results["deleted_files"]:
        print(f" {entry['file_path']} - 大小: {entry['size']} 字节")

    print(f"\n修改文件 ({len(results['modified_files'])}):")
    for entry in results["modified_files"]:
        print(f" {entry['file_path']} - 大小变化: {entry['size_change']} 字节 "
              f"(旧: {entry['old_size']}, 新: {entry['new_size']})")

    print("\n统计摘要:")
    # The total counts every entry in every category of the result dict.
    print(f" 总计文件: {sum(len(v) for v in results.values())}")
    print(f" 新增: {len(results['added_files'])}")
    print(f" 删除: {len(results['deleted_files'])}")
    print(f" 修改: {len(results['modified_files'])}")
    print(f" 未变化: {len(results['unchanged_files'])}")
# 使用示例
import argparse
def main(monitor_path, min_size_mb=30):
    """Create a baseline snapshot, or compare against an existing one.

    On the first run (no "initial_snapshot.json" in the current working
    directory) a baseline snapshot is created.  On later runs a new
    "updated_snapshot.json" is created, compared with the baseline, and the
    differences are printed.

    Args:
        monitor_path: Directory to scan.
        min_size_mb: Only files at least this many MB are recorded.  Use the
            same value for the baseline and for later runs, otherwise files
            crossing the threshold show up as added or deleted.
    """
    # A mistyped path would otherwise yield an empty snapshot without any
    # warning, and an empty baseline makes every later file look "added".
    if not os.path.isdir(monitor_path):
        print(f"目录不存在或不是文件夹: {monitor_path}")
        return
    print(f"开始扫描目录: {monitor_path}")
    snapshot = FileSnapshot(monitor_path)
    # The baseline is looked up relative to the current working directory.
    initial_snapshot_file = "initial_snapshot.json"
    if os.path.exists(initial_snapshot_file):
        # Later runs: create a new snapshot and diff it against the baseline.
        print("检测到已有快照,开始对比模式...")
        new_file = snapshot.create_snapshot("updated_snapshot", min_size_mb=min_size_mb)
        results = snapshot.compare_snapshots(initial_snapshot_file, new_file)
        print_comparison_results(results)
    else:
        # First run: create the baseline snapshot.
        print("首次运行,创建初始快照...")
        snapshot_file = snapshot.create_snapshot("initial_snapshot", min_size_mb=min_size_mb)
        print(f"请妥善保存此文件: {snapshot_file}")
if __name__ == "__main__":
    # Command-line entry point: one positional argument, the directory to monitor.
    parser = argparse.ArgumentParser(description='文件监控程序')
    parser.add_argument('path', type=str, help='要监控的目录路径')
    args = parser.parse_args()
    # Hand the parsed path to the main routine.
    main(args.path)
如果尚未安装 pip 和 pandas,需要先安装(生成 .xlsx 报告还需要 openpyxl 或 xlsxwriter);具体安装方法请问 AI。
再建一个 BAT/CMD 文件,用来扫描你想要对比的目录:
@echo off
rem Switch to the folder that holds this BAT file so that initial_snapshot.json
rem and updated_snapshot.json are always read from and written to the same
rem place, whatever directory the script was launched from.
rem Assumes this BAT file sits next to filesnapshot.py - adjust if it does not.
cd /d "%~dp0"
echo 正在创建初始快照...
rem Alternative target directory:
rem filesnapshot.py "C:\Users\Administrator\AppData\Roaming"
filesnapshot.py "C:\Users\Administrator"
pause