如何递归对比两个文件夹当中npy文件的内容

C__Try

于 2023-11-09 15:02:10 发布

阅读量310

点赞数

CC 4.0 BY-SA版权

文章标签： npy

本文链接：https://blog.csdn.net/C__Try/article/details/134312095

本文介绍了一个Python程序，用于递归遍历两个文件夹，按文件名的最长共同前缀为其中的NumPy（.npy）文件配对，并比较每对文件的形状、数值差异和余弦相似度。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

import os
import numpy as np
from scipy.spatial.distance import cosine
import csv

# Build a {file name: absolute path} mapping for every file under a folder
def get_file_mapping(folder_path):
    """Recursively map each file name under ``folder_path`` to its absolute path.

    Args:
        folder_path: Directory to walk with ``os.walk``.

    Returns:
        A dict keyed by bare file name (no directory part). When the same
        name occurs in several sub-directories, the path visited last by
        ``os.walk`` replaces the earlier one.
    """
    return {
        name: os.path.abspath(os.path.join(parent, name))
        for parent, _subdirs, names in os.walk(folder_path)
        for name in names
    }

# Length of the longest common prefix of two file names
def find_max_common_prefix_length(file1, file2):
    """Return how many leading characters ``file1`` and ``file2`` share.

    Args:
        file1: First string (a file name).
        file2: Second string (a file name).

    Returns:
        The number of positions, counted from index 0, at which both strings
        hold the same character before the first mismatch or before the
        shorter string ends.
    """
    matched = 0
    for char1, char2 in zip(file1, file2):
        if char1 != char2:
            break
        matched += 1
    return matched

# Pair up the files of folder a and folder b by file name
def find_matching_files(folder_a, folder_b):
    """Pair each file under ``folder_a`` with a counterpart under ``folder_b``.

    Matching runs in two passes:

    1. Files whose names are identical in both folders are paired first, so
       that an earlier file with a merely similar name cannot claim another
       file's exact counterpart.
    2. Every remaining file of ``folder_a`` is paired with the still-unused
       file of ``folder_b`` sharing the longest common name prefix. Ties go
       to the candidate encountered first; a file with no candidate sharing
       at least one leading character stays unmatched.

    Each file of ``folder_b`` is used at most once.

    Args:
        folder_a: Root of the first directory tree.
        folder_b: Root of the second directory tree.

    Returns:
        A list of ``(path_in_a, path_in_b)`` tuples of absolute paths, sorted
        by the base name of ``path_in_a``.
    """
    a_map = get_file_mapping(folder_a)
    b_map = get_file_mapping(folder_b)

    matching_files = []
    unmatched_a = {}

    # Pass 1: identical names always belong together.
    for file_a, path_a in a_map.items():
        if file_a in b_map:
            matching_files.append((path_a, b_map.pop(file_a)))
        else:
            unmatched_a[file_a] = path_a

    # Pass 2: longest common prefix among the files that are still free.
    for file_a, path_a in unmatched_a.items():
        best_length = 0
        best_file_b = None

        for file_b in b_map:
            common_length = find_max_common_prefix_length(file_a, file_b)
            if common_length > best_length:
                best_length = common_length
                best_file_b = file_b

        if best_file_b is not None:
            # pop() also removes the file so it cannot be matched twice.
            matching_files.append((path_a, b_map.pop(best_file_b)))

    # Order the pairs by the base name of the file from folder a.
    matching_files.sort(key=lambda pair: os.path.basename(pair[0]))

    return matching_files

def _widen_integers(array):
    """Return ``array`` with bool/integer data widened so arithmetic cannot wrap.

    Boolean and integer arrays are converted to the NumPy result type of
    their dtype and int64 (which is float64 for uint64 input); arrays of any
    other dtype are returned unchanged.
    """
    if array.dtype.kind in "bui":
        return array.astype(np.result_type(array.dtype, np.int64))
    return array


# Compare two .npy files and describe their differences
def compare_npy_files(file_a, file_b):
    """Load two ``.npy`` files and describe how their contents differ.

    Args:
        file_a: Path of the first ``.npy`` file.
        file_b: Path of the second ``.npy`` file.

    Returns:
        A multi-line string in which every line ends with a newline. The
        first line reports both shapes. If the shapes differ it is the only
        line. Otherwise it is followed by the max / min / average value of
        each array with their absolute gap, the max / min / mean element-wise
        absolute difference, and the cosine similarity of the flattened
        arrays.

    Raises:
        Whatever ``np.load`` raises for unreadable input; NumPy's reduction
        errors for zero-size arrays are not handled here.
    """
    result_describe = ""

    # Widen bool/integer data up front: subtracting unsigned integers would
    # otherwise wrap around (uint8 1 - 2 == 255) and subtracting booleans
    # raises a TypeError.
    array_a = _widen_integers(np.load(file_a))
    array_b = _widen_integers(np.load(file_b))

    if array_a.shape != array_b.shape:
        result_describe += f"shape {array_a.shape} {array_b.shape} a.shape==b.shape:{array_a.shape == array_b.shape}\n"
    else:
        result_describe += f"shape {array_a.shape} {array_b.shape}\n"

        # Compute the element-wise absolute difference once and reuse it.
        abs_difference = np.abs(array_a - array_b)
        max_difference = np.max(abs_difference)
        min_difference = np.min(abs_difference)
        average_difference = np.mean(abs_difference)

        # Cosine similarity of the flattened arrays. Integer data is passed
        # as float64 so the products formed inside the distance computation
        # cannot overflow an integer dtype.
        flat_array_a = array_a.ravel()
        flat_array_b = array_b.ravel()
        if flat_array_a.dtype.kind == "i":
            flat_array_a = flat_array_a.astype(np.float64)
        if flat_array_b.dtype.kind == "i":
            flat_array_b = flat_array_b.astype(np.float64)
        similarity = 1 - cosine(flat_array_a, flat_array_b)

        result_describe += f"max_value：{np.max(array_a)} {np.max(array_b)} {np.abs(np.max(array_a) - np.max(array_b))}\n"

        result_describe += f"min_value：{np.min(array_a)} {np.min(array_b)} {np.abs(np.min(array_a) - np.min(array_b))}\n"

        result_describe += f"avg_value：{np.average(array_a)} {np.average(array_b)} {np.abs(np.average(array_a) - np.average(array_b))}\n"

        result_describe += f"Max Value Difference：{max_difference}\n"

        result_describe += f"Min Value Difference：{min_difference}\n"

        result_describe += f"Average Error Difference：{average_difference}\n"

        result_describe += f"similarity: {similarity}\n"

    return result_describe

def main(folder_a="a", folder_b="b", output_csv="comparison_results.csv"):
    """Compare every matched pair of files and record the results.

    For each pair returned by ``find_matching_files`` the comparison text is
    printed and one row is appended to ``output_csv``.

    Args:
        folder_a: Root of the first directory tree (default ``"a"``).
        folder_b: Root of the second directory tree (default ``"b"``).
        output_csv: CSV file to append to (default
            ``"comparison_results.csv"``).

    CSV row layout: the first cell is ``"<name_a> and <name_b>"``. When the
    shapes match, the remaining cells are the comparison lines after the
    shape line. When the shapes differ, the single shape line is kept so the
    row still records why no statistics are present.
    """
    matching_files = find_matching_files(folder_a, folder_b)
    if not matching_files:
        # Nothing to report; do not create or touch the CSV file.
        return

    # Open the CSV once for the whole run. The encoding is explicit because
    # the rows contain non-ASCII characters (the full-width colon) that some
    # locale default encodings cannot represent.
    with open(output_csv, mode='a', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)

        for file_a, file_b in matching_files:
            name_a = os.path.basename(file_a)
            name_b = os.path.basename(file_b)

            print(f"匹配的文件：{name_a} 和 {name_b}")
            comparison_result = compare_npy_files(file_a, file_b)
            print(f"对比结果：\n{comparison_result}")
            print("")

            # Drop the empty string left after the trailing newline.
            result_lines = comparison_result.split('\n')[:-1]
            if len(result_lines) > 1:
                # Shapes matched: skip the shape line, keep the statistics.
                detail_cells = result_lines[1:]
            else:
                # Shape mismatch: the shape line is the whole result.
                detail_cells = result_lines
            writer.writerow([f"{name_a} and {name_b}"] + detail_cells)


# Run the comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()