Python多进程分片下载远端大文件 - multiprocessing paramiko

文章介绍了如何利用Python的paramiko库进行SSH连接,通过SFTP协议实现大文件的分片下载,同时结合multiprocessing模块实现多进程并发下载,以提高下载效率。每个进程负责下载文件的一个片段,最终合并成完整的本地文件。代码示例展示了两种实现方式,一种是将多进程逻辑直接放在主函数中,另一种是封装成单独的下载函数。测试结果显示,多进程下载比单进程的getfo()方法更快。

Python多进程分片下载远端大文件 ,可以按照以下流程设计代码框架:

  1. 导入需要的模块:首先,导入所需的模块,包括paramiko、multiprocessing和time。
  2. 创建下载函数:创建一个用于分片下载文件的函数。该函数将使用SSH连接到远程服务器,并使用SFTP协议下载文件的指定分片到本地路径。使用sftp_file.seek() 和 file.seek() 确保正确的块被下载
  3. 主函数:在主函数中,您需要设置远程服务器的主机名、用户名、密码、远程文件路径和本地存储路径。还需要确定要划分的分片数量。我们使用multiprocessing.cpu_count()获取当前系统的CPU数量,并将文件分割成块,数量与CPU个数一致。我们为每个文件块创建一个Process实例,负责该块的下载。随后使用process.join()等待所有进程完成,并在完成后输出一条完成信息。
  4. 运行主函数:在主函数中,我们为每个分片创建一个Process进程,并发地调用分片下载函数。通过确定每个进程所负责的起始字节和结束字节,实现对服务器端文件的分片下载。最后,我们等待所有进程完成。

Python多进程分片下载远端大文件源码(多进程直接写在main函数里):

import paramiko
import multiprocessing
import time

def download_chunk_file(start_pos, end_pos, remote_path, local_path, ssh_info):
    """Download bytes [start_pos, end_pos) of a remote file into the same
    region of a pre-allocated local file over SFTP.

    Args:
        start_pos: Inclusive byte offset where this chunk starts.
        end_pos: Exclusive byte offset where this chunk ends.
        remote_path: Path of the source file on the remote host.
        local_path: Path of the local target file; it must already exist and
            be at least end_pos bytes long (main() pre-truncates it).
        ssh_info: Keyword arguments for paramiko.SSHClient.connect().
    """
    print("download_chunk_file start")
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(**ssh_info)

    sftp = client.open_sftp()
    try:
        # Open both files and seek them to the chunk's start offset so the
        # downloaded bytes land in the matching region of the local file.
        with sftp.open(remote_path, "rb") as remote_file, \
                open(local_path, "r+b") as local_file:
            remote_file.seek(start_pos)
            local_file.seek(start_pos)

            # Read only the bytes that belong to this chunk.  SFTPFile.read(n)
            # may return fewer than n bytes, so track the remaining count
            # instead of looping until EOF -- the original EOF loop made every
            # worker re-download the file from start_pos all the way to the
            # end, duplicating the work of all later chunks.
            remaining = end_pos - start_pos
            while remaining > 0:
                buffer = remote_file.read(min(remaining, 32768))
                if not buffer:
                    break  # premature EOF: remote file shorter than expected
                local_file.write(buffer)
                remaining -= len(buffer)
    finally:
        client.close()
    print("chunk file with start_pos ~ end_pos: {} ~ {}, Download successfully!".format(start_pos, end_pos))
    print("download_chunk_file end")

def main():
    """Split a remote file into CPU-count chunks and download them in
    parallel, one process per chunk, into a pre-allocated local file."""
    print("main start")
    host = "host"
    port = 22
    username = "username"
    password = "password"

    remote_path = '/remote_dir/remote_file'
    local_path = '/local_dir/local_file'

    ssh_info = {
        "hostname": host,
        "port": port,
        "username": username,
        "password": password,
    }

    # One short-lived connection just to learn the remote file size.
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(**ssh_info)
    sftp = client.open_sftp()
    file_size = sftp.stat(remote_path).st_size
    sftp.close()
    client.close()

    # get number of CPU with high efficiency
    processes = multiprocessing.cpu_count()
    print("number of CPU is {}".format(processes))
    # Each of the first n-1 processes handles chunk_size bytes; the last one
    # also picks up the remainder when file_size is not evenly divisible.
    # max(1, ...) avoids a zero chunk size for files smaller than the CPU count.
    chunk_size = max(1, file_size // processes)

    # Pre-allocate the local file so every process can write its own region.
    with open(local_path, "wb") as f:
        f.truncate(file_size)

    multiprocess_download_start = time.time()
    process_list = []
    for i in range(processes):
        start_pos = i * chunk_size
        if start_pos >= file_size:
            break  # file smaller than the number of workers
        # The last chunk must extend to file_size, otherwise the trailing
        # file_size % processes bytes would never be downloaded.
        end_pos = file_size if i == processes - 1 else min((i + 1) * chunk_size, file_size)
        p = multiprocessing.Process(target=download_chunk_file,
                                    args=(start_pos, end_pos, remote_path, local_path, ssh_info))
        p.start()
        print(p)
        process_list.append(p)

    # wait for all the processes to finish
    for p in process_list:
        p.join()
        print(p)
    multiprocess_download_end = time.time()
    multiprocess_download_cost = multiprocess_download_end - multiprocess_download_start
    print("Full file Download successfully! Cost: {:.2f}s".format(multiprocess_download_cost))
    print("main end")
if __name__ == "__main__":
    main()

Python多进程分片下载远端大文件源码(多进程单独封装函数):

import paramiko
import multiprocessing
import time

def get_remote_file_size(ssh_info, remote_path):
    """Connect over SSH/SFTP and return the size in bytes of remote_path."""
    ssh_client = paramiko.SSHClient()
    ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    ssh_client.connect(**ssh_info)
    sftp_client = ssh_client.open_sftp()
    # stat() follows the same semantics as os.stat on the remote side.
    file_size = sftp_client.stat(remote_path).st_size
    print("remote_file_size:{}".format(file_size))
    sftp_client.close()
    ssh_client.close()
    return file_size

def download_chunk_file(ssh_info, remote_path, local_path, start_pos, end_pos):
    """Download bytes [start_pos, end_pos) of a remote file into the same
    region of a pre-allocated local file over SFTP, printing per-operation
    read/write timings.

    Args:
        ssh_info: Keyword arguments for paramiko.SSHClient.connect().
        remote_path: Path of the source file on the remote host.
        local_path: Path of the local target file; must already exist and be
            at least end_pos bytes long (download_multiprocessing pre-truncates it).
        start_pos: Inclusive byte offset where this chunk starts.
        end_pos: Exclusive byte offset where this chunk ends.
    """
    print("download_chunk_file start")
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(**ssh_info)

    sftp = client.open_sftp()
    try:
        # Open both files and seek them to the chunk's start offset so the
        # downloaded bytes land in the matching region of the local file.
        with sftp.open(remote_path, "rb") as remote_file, \
                open(local_path, "r+b") as local_file:
            remote_file.seek(start_pos)
            local_file.seek(start_pos)

            # Read only the bytes that belong to this chunk.  SFTPFile.read(n)
            # may return fewer than n bytes, so track the remaining count
            # instead of looping until EOF -- the original EOF loop made every
            # worker re-download the file from start_pos to the very end,
            # duplicating the work of all later chunks.
            remaining = end_pos - start_pos
            while remaining > 0:
                read_start = time.time()
                buffer = remote_file.read(min(remaining, 32768))
                if not buffer:
                    break  # premature EOF: remote file shorter than expected
                print("read  cost time {:.2f}s".format(time.time() - read_start))
                write_start = time.time()
                local_file.write(buffer)
                print("write cost time {:.2f}s".format(time.time() - write_start))
                remaining -= len(buffer)
    finally:
        client.close()
    print("chunk file with start_pos ~ end_pos: {} ~ {}, Download successfully!".format(start_pos, end_pos))
    print("download_chunk_file end")

def download_multiprocessing(ssh_info, remote_path, local_path):
    """Download remote_path to local_path by splitting it into byte ranges
    and fetching each range in its own process.

    Args:
        ssh_info: Keyword arguments for paramiko.SSHClient.connect().
        remote_path: Path of the source file on the remote host.
        local_path: Path where the assembled local copy is written.
    """
    # get number of CPU with high efficiency
    num_processes = multiprocessing.cpu_count()
    print("number of CPU is {}, number of process is {}".format(multiprocessing.cpu_count(), num_processes))
    # get remote file size
    file_size = get_remote_file_size(ssh_info, remote_path)
    # create new empty local file, same size with remote file
    with open(local_path, "wb") as f:
        f.truncate(file_size)

    # Each of the first n-1 processes handles chunk_size bytes; the last one
    # also picks up the remainder.  max(1, ...) avoids a zero chunk size for
    # files smaller than the CPU count.
    chunk_size = max(1, file_size // num_processes)
    print("chunk_size is {}".format(chunk_size))
    # create number of process
    processes = []
    # create a process for each chunk
    for index in range(num_processes):
        start_pos = index * chunk_size
        if start_pos >= file_size:
            break  # more workers than bytes; the rest have nothing to do
        end_pos = start_pos + chunk_size
        # The last process downloads through the end of the file.  end_pos is
        # exclusive, so it must be file_size -- the original file_size - 1
        # silently dropped the final byte of the file.
        if index == num_processes - 1 or end_pos > file_size:
            end_pos = file_size

        args = (ssh_info, remote_path, local_path, start_pos, end_pos)
        process = multiprocessing.Process(target=download_chunk_file, args=args)

        process.start()
        print(process)
        processes.append(process)

    # wait for all the processes to finish
    for process in processes:
        process.join()
        print(process)

def main():
    """Entry point: configure the connection details, run the parallel
    download, and report the total elapsed time."""
    ssh_info = {
        "hostname": "host",
        "port": 22,
        "username": "username",
        "password": "password",
    }

    remote_path = '/remote_dir/remote_file'
    local_path = '/local_dir/local_file'

    started_at = time.time()
    download_multiprocessing(ssh_info, remote_path, local_path)
    elapsed = time.time() - started_at
    print("Full file Download successfully! Cost time: {:.2f}s".format(elapsed))


if __name__ == "__main__":
    main()

运行结果输出:

$ python multi_process_download_single_bigfile_def.py
number of CPU is 4
remote_file_size:63376366
chunk_size is 15844091
<Process(Process-1, started)>
download_chunk_file start
<Process(Process-2, started)>
download_chunk_file start
<Process(Process-3, started)>
<Process(Process-4, started)>
<Process(Process-1, started)>
download_chunk_file start
download_chunk_file start
read  cost time 6.19s
write cost time 0.01s
read  cost time 6.22s
write cost time 0.01s
read  cost time 6.20s
write cost time 0.01s
read  cost time 0.00s
write cost time 0.00s
read  cost time 0.00s
chunk file with start_pos ~ end_pos: 47532273 ~ 63376365, Download successfully!
download_chunk_file end
read  cost time 6.24s
write cost time 0.01s
read  cost time 4.25s
write cost time 0.01s
read  cost time 4.36s
write cost time 0.01s
read  cost time 4.34s
write cost time 0.01s
read  cost time 0.03s
write cost time 0.00s
read  cost time 0.00s
chunk file with start_pos ~ end_pos: 31688182 ~ 47532273, Download successfully!
download_chunk_file end
read  cost time 4.26s
write cost time 0.01s
read  cost time 0.00s
write cost time 0.00s
read  cost time 0.00s
chunk file with start_pos ~ end_pos: 15844091 ~ 31688182, Download successfully!
download_chunk_file end
read  cost time 4.39s
write cost time 0.01s
read  cost time 4.29s
write cost time 0.01s
read  cost time 0.00s
write cost time 0.00s
read  cost time 0.00s
chunk file with start_pos ~ end_pos: 0 ~ 15844091, Download successfully!
download_chunk_file end
<Process(Process-2, stopped)>
<Process(Process-3, stopped)>
<Process(Process-4, stopped)>
Full file Download successfully! Cost: 19.62s

参考:

Python paramiko文件传输显示上传下载进度信息 - print_Entropy-Go的博客-CSDN博客

Python paramiko实现文件的简单传输上传和下载代码_Entropy-Go的博客-CSDN博客

单进程处理时,建议直接使用getfo()函数下载,实测下载速度比read(), write()方法快很多。

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值