python多线程、多进程处理单个(大,超大)文件

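本文介绍如何使用 Python 的 multiprocessing 和 threading 模块并发处理单个大文件:`async_kd_tokenizer` 函数按字节偏移把文件切分成若干块,每个 worker 只读取并清洗自己负责的那一块;文中分别给出多进程和多线程两种实现,对 `如果蜗牛有爱情.txt` 进行处理。最后通过比较两种实现的返回结果(`results_th == results`),验证两者得到的内容完全一致(这里比较的是结果是否相同,而不是运行效率)。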

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

1 多进程

import os
from multiprocessing import Pool


def safe_readline(f):
    """Read one line from the text file ``f``, tolerating a bad start position.

    If decoding fails with ``UnicodeDecodeError`` (for example because the
    file position sits in the middle of a multi-byte character), step the
    position back by one and try again until a line can be decoded.

    Args:
        f: A seekable file object opened for reading.

    Returns:
        The line returned by ``f.readline()`` once decoding succeeds.
    """
    retry_pos = f.tell()
    while True:
        try:
            text = f.readline()
        except UnicodeDecodeError:
            # Move one step towards the start of the file and retry from there.
            retry_pos -= 1
            f.seek(retry_pos)
        else:
            return text
            
            
def async_kd_tokenizer(filename, worker_id, num_workers):
    """Read and clean the lines of one chunk of ``filename``.

    The file is split by position into ``num_workers`` chunks of
    ``size // num_workers`` each. A line belongs to the worker whose chunk
    contains the line's start position, so every line is returned by exactly
    one worker. Spaces and newline characters are removed from each line and
    lines that become empty are skipped.

    Args:
        filename: Path of the file to read.
        worker_id: Zero-based index of this worker's chunk.
        num_workers: Total number of chunks the file is split into.

    Returns:
        A list of the cleaned, non-empty lines owned by this worker, in file
        order.
    """
    # NOTE: the file is opened with the platform default text encoding.
    with open(filename, 'r') as f:
        # Only seek/tell are used to locate the chunk, so the whole file is
        # never loaded at once regardless of its size.
        size = os.fstat(f.fileno()).st_size
        print(f'size {size}')
        chunk_size = size // num_workers
        offset = worker_id * chunk_size
        if worker_id == num_workers - 1:
            # The last worker also takes the size % num_workers tail that the
            # integer division leaves over.
            end = size
        else:
            end = offset + chunk_size
        f.seek(offset)
        print(f'offset {offset}')
        if worker_id > 0:
            # Drop the line containing ``offset``: it starts at or before the
            # previous worker's end and is therefore owned by that worker.
            # Keyed on worker_id (not offset) so that tiny files, where
            # chunk_size is 0, do not make every worker re-read line one.
            safe_readline(f)
        lines = []
        # Check the start position *before* reading each line. Checking only
        # after appending (and not at all for blank lines) lets a worker run
        # into the next chunk and return lines the next worker also returns.
        while f.tell() <= end:
            line = f.readline()
            if not line:
                break  # end of file
            line = line.replace(" ", '').replace("\n", '')
            if line:
                lines.append(line)
        return lines
    
    
def encode_file(path, workers=4):
    """Process ``path`` with a pool of worker processes and merge the results.

    One ``async_kd_tokenizer`` task is submitted per worker index; after the
    pool has finished, the per-worker line lists are concatenated in worker
    order.

    Args:
        path: Path of the file to process; must exist.
        workers: Number of processes, and of chunks the file is split into.

    Returns:
        A single list with the lines returned by all workers.
    """
    assert os.path.exists(path)
    pool = Pool(processes=workers)
    pending = [
        pool.apply_async(async_kd_tokenizer, (path, worker_id, workers))
        for worker_id in range(workers)
    ]
    # No more tasks will be submitted; wait for every worker to finish.
    pool.close()
    pool.join()

    merged = []
    for task in pending:
        merged += task.get()
    return merged

# Guard the entry point: with the "spawn" start method the worker processes
# re-import this module, and an unguarded call would try to create a new Pool
# inside every worker.
if __name__ == '__main__':
    results = encode_file('/Users/lisen/Downloads/如果蜗牛有爱情.txt', workers=4)
    print(results)

2 多线程

import threading


class FileHandlerThread(threading.Thread):
    """Thread that runs ``func(*args)`` and keeps its return value.

    Args:
        func: Callable executed in the thread.
        args: Positional arguments passed to ``func``.
    """

    def __init__(self, func, args):
        super().__init__()
        self.args = args
        self.func = func
        # Initialised up front so get_result() is well-defined even when the
        # thread has not run yet or ``func`` raised before returning.
        self.result = None

    def run(self):
        """Call ``func(*args)`` and store the return value on the thread."""
        self.result = self.func(*self.args)

    def get_result(self):
        """Return the stored result, or ``None`` if none has been produced.

        This does not wait for the thread; call ``join()`` first.
        """
        return self.result

    
def encode_file_thread(path, workers=4):
    """Process ``path`` with one thread per chunk and merge the results.

    Each thread runs ``async_kd_tokenizer`` for its own worker index; once all
    threads have been joined, their line lists are concatenated in worker
    order.

    Args:
        path: Path of the file to process; must exist.
        workers: Number of threads, and of chunks the file is split into.

    Returns:
        A single list with the lines returned by all threads.
    """
    assert os.path.exists(path)
    handlers = []
    for worker_id in range(workers):
        handler = FileHandlerThread(
            async_kd_tokenizer, args=(path, worker_id, workers)
        )
        handler.start()
        handlers.append(handler)

    # Wait for every thread before collecting any result.
    for handler in handlers:
        handler.join()

    merged = []
    for handler in handlers:
        merged += handler.get_result()
    return merged
	
# Run only when executed as a script, so that importing this module (including
# the re-import done by spawned worker processes) does not redo the work.
if __name__ == '__main__':
    results_th = encode_file_thread('/Users/lisen/Downloads/如果蜗牛有爱情.txt', workers=4)
    print(results_th)

    # ``results`` is the list produced by the multiprocessing run of
    # encode_file; both approaches should return the same lines.
    print(results_th == results)  # True

参考:https://www.liujiangblog.com/course/python/79

评论 5
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值