1 多进程
import os
from multiprocessing import Pool
def safe_readline(f):
    """Read the next line from ``f``, backing up if the read fails to decode.

    The position reported by ``f.tell()`` on entry is remembered. Whenever
    ``f.readline()`` raises ``UnicodeDecodeError``, that position is moved
    back by one, the file is re-positioned there, and the read is retried.

    Args:
        f: A seekable file object opened for reading text.

    Returns:
        The value of the first ``f.readline()`` call that does not raise
        ``UnicodeDecodeError``.
    """
    cursor = f.tell()
    while True:
        try:
            text = f.readline()
        except UnicodeDecodeError:
            # Step back one position and try again from there.
            cursor -= 1
            f.seek(cursor)
        else:
            return text
def async_kd_tokenizer(filename, worker_id, num_workers, encoding=None):
    """Return the cleaned, non-blank lines of one worker's share of a text file.

    The file is split into ``num_workers`` ranges of ``size // num_workers``
    bytes each. A line belongs to the worker whose range ``(offset, end]``
    contains the position at which the line starts; position 0 belongs to
    worker 0, and the last worker's range is extended to the end of the file
    so the ``size % num_workers`` remainder is not lost. The boundary is
    checked before every read (blank lines included), so no line is returned
    by two workers.

    Positions are compared using ``f.tell()`` on a text-mode file, as the
    original code did.
    NOTE(review): this presumes ``tell()`` yields plain byte offsets for the
    file's encoding (as with UTF-8 without BOM) -- confirm for other encodings.

    Args:
        filename: Path of the text file to read.
        worker_id: Zero-based index of this worker.
        num_workers: Total number of workers sharing the file.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default, as before.

    Returns:
        A list of the worker's lines in file order, each with every space and
        newline character removed; lines that are empty afterwards are
        skipped.
    """
    with open(filename, 'r', encoding=encoding) as f:
        # Only the size is queried and the file is read via seek/readline,
        # so the whole file is never loaded at once.
        size = os.fstat(f.fileno()).st_size
        print(f'size {size}')
        chunk_size = size // num_workers
        offset = worker_id * chunk_size
        if worker_id == num_workers - 1:
            # The last worker also takes the bytes left over by the floor division.
            end = size
        else:
            end = offset + chunk_size
        f.seek(offset)
        print(f'offset {offset}')
        if worker_id > 0:
            # The line containing `offset` starts at or before the previous
            # worker's end, so that worker owns it; skip it here. Keyed on
            # worker_id rather than offset so that workers with an empty
            # range (chunk_size == 0) do not all re-read the first line.
            safe_readline(f)
        lines = []
        # Check the start position *before* reading so that a line starting
        # past `end` is never consumed, even after a run of blank lines.
        while f.tell() <= end:
            line = f.readline()
            if not line:
                break  # end of file
            line = line.replace(" ", '').replace("\n", '')
            if line:
                lines.append(line)
        return lines
def encode_file(path, workers=4):
    """Process ``path`` with a pool of worker processes and merge the results.

    One ``async_kd_tokenizer`` task is submitted per worker index. After the
    pool has been closed and joined, the per-task lists are concatenated in
    worker-index order.

    Args:
        path: Path of the file to process; it must exist.
        workers: Number of pool processes, and of tasks submitted.

    Returns:
        A single list holding every task's result, in worker-index order.

    Raises:
        AssertionError: If ``path`` does not exist.
    """
    assert os.path.exists(path)
    pool = Pool(processes=workers)
    pending = [
        pool.apply_async(async_kd_tokenizer, (path, worker_id, workers))
        for worker_id in range(workers)
    ]
    # No more tasks will be submitted; wait for the workers to finish.
    pool.close()
    pool.join()
    results = []
    for task in pending:
        results.extend(task.get())
    return results
# Run the multiprocessing demo only when this file is executed as a script.
# With the "spawn" start method each worker process re-imports the main
# module; without this guard every worker would try to create its own pool.
if __name__ == '__main__':
    results = encode_file('/Users/lisen/Downloads/如果蜗牛有爱情.txt', workers=4)
    print(results)
2 多线程
import threading
class FileHandlerThread(threading.Thread):
    """Thread that calls ``func(*args)`` and keeps the return value.

    Args:
        func: Callable executed in the thread.
        args: Positional arguments unpacked into ``func``.
    """

    def __init__(self, func, args):
        super().__init__()
        self.args = args
        self.func = func
        # Initialised up front so get_result() has a defined value before the
        # thread has run, or when func raised, without catching exceptions.
        self.result = None

    def run(self):
        """Call ``func(*args)`` and store its return value in ``result``."""
        self.result = self.func(*self.args)

    def get_result(self):
        """Return the stored result, or ``None`` if no result has been stored."""
        return self.result
def encode_file_thread(path, workers=4):
    """Process ``path`` with one thread per worker and merge the results.

    A ``FileHandlerThread`` running ``async_kd_tokenizer`` is started for each
    worker index. Once every thread has been joined, their results are
    concatenated in worker-index order.

    Args:
        path: Path of the file to process; it must exist.
        workers: Number of threads to start.

    Returns:
        A single list holding every thread's result, in worker-index order.

    Raises:
        AssertionError: If ``path`` does not exist.
    """
    assert os.path.exists(path)
    threads = []
    for worker_id in range(workers):
        thread = FileHandlerThread(async_kd_tokenizer, args=(path, worker_id, workers))
        thread.start()
        threads.append(thread)
    # Wait for all threads before collecting anything.
    for thread in threads:
        thread.join()
    results = []
    for thread in threads:
        results.extend(thread.get_result())
    return results
# Run the threading demo only when this file is executed as a script. This
# file also uses a multiprocessing Pool, and worker processes started with
# the "spawn" method re-import the main module; the guard keeps them from
# re-running this demo, which also reads `results` from the process-based run.
if __name__ == '__main__':
    results_th = encode_file_thread('/Users/lisen/Downloads/如果蜗牛有爱情.txt', workers=4)
    print(results_th)
    print(results_th == results)  # True
参考:https://www.liujiangblog.com/course/python/79