1.引言
从数据库中加载数据,当数据量大的时候,通过多个线程并行加载缩短加载时间。
实际应用是从mysql中读取120万的用户数据.
单线程耗时:20秒
并行加载: 6秒.
2.方案
实现一个并行加载器.
实现方案要点:
- 数据分片: 要求记录有int型/long型的唯一字段,作为分片的key.每1万条记录作为一片
- 根据线程数,把数据分片尽量均匀分配到线程上
- 线程异常处理:
3.实现
3.1ParellelLoader
并行加载器类
@Data
@Slf4j
public class ParellelLoader {
final int RANGE_SIZE = 10000; ///< 分段统计大小,如每10000个id一个区段.统计每个区段的用户数
ParallelLoaderProvider<?> provider;
TimeCounter tc = new TimeCounter();
private int threadNum; ///< 处理线程数
private List<Range> pieceRanges = new ArrayList<>(); ///< 数据分片信息
private Object mutex = new Object();
private Throwable threadException; ///< 线程异常
public ParellelLoader(ParallelLoaderProvider<?> provider, int threadNum) {
this.provider = provider;
this.threadNum = threadNum;
}
@Data
class Range {
private Long low; ///< [low,high)
private Long high; ///< -1:表示不限制
public Range(long low,long high) {
this.low = low;
this.high = high;
}
private List<UserInfo> data; ///< 返回的数据
}
private void prepare() {
tc.start("query range");
List<IdRange> records = provider.queryRange(RANGE_SIZE);
tc.watch();
long recordCount = records.stream().mapToLong(IdRange::getCount).sum();
int pieceSize = (int) (recordCount/threadNum);
int cnt = 0;
Range lastRange = new Range(0,0);
Iterator<IdRange> it = records.iterator();
while(it.hasNext()) {
IdRange item = it.next();
cnt += item.getCount();
if (cnt>=pieceSize) {
Range range = new Range(lastRange.getHigh(),(item.getRange()+1)*RANGE_SIZE);
pieceRanges.add(range);
lastRange.setHigh(range.getHigh());
cnt = 0;
}
}
if (cnt>0) {
pieceRanges.add(new Range(lastRange.getHigh(),-1));
}
log.info("data divided {} pieces",pieceRanges.size());
return;
}
private void runTask() throws Exception {
CountDownLatch latch = new CountDownLatch(pieceRanges.size());
for (Range range : pieceRanges) {
Thread thr = new Thread(new Runnable() {
@SuppressWarnings("unchecked")
@Override
public void run() {
range.data = (List<UserInfo>) provider.getRangeData(range.getLow(),range.getHigh());
latch.countDown();
}
});
thr.setUncaughtExceptionHandler(new Thread.UncaughtExceptionHandler() {
@Override
public void uncaughtException(Thread t, Throwable e) {
synchronized(mutex) {
if (threadException==null)
threadException = e;
}
latch.countDown();
log.error("runTask exception:{}",e);
}
});
thr.start();
}
latch.await();
}
public List<UserInfo> getAll() {
List<UserInfo> users = new ArrayList<>();
for (Range range : pieceRanges) {
users.addAll(range.data);
}
return users;
}
///< 执行并行加载
public void run() throws Exception {
TimeCounter tc = new TimeCounter();
tc.start("prepare");
prepare();
tc.watchAndContinue("exec threads");
runTask();
tc.watch();
if (threadException!=null)
throw new Exception(threadException);
}
}
3.2ParallelLoaderProvider
并行加载器提供者接口
public interface ParallelLoaderProvider<T> {
List<IdRange> queryRange(int rangeSize);
List<T> getRangeData(long low, long high);
}
3.3IdRange
Id范围类,用于描述每个分片范围
@Data
public class IdRange {
private Integer range; ///< 范围
private Long count; ///< 记录数
public IdRange(int range, long count) {
this.range = range;
this.count = count;
}
}
4.使用示例
实现用户信息的并行加载.
提供具体加载器的实现和数据层的访问。
4.1ParallelUserLoaderProvider
并行用户加载器提供者.
public class ParallelUserLoaderProvider implements ParallelLoaderProvider<UserInfo> {
private UserIdRangeDAO repo;
public ParallelUserLoaderProvider() {
repo = Application.getBean(UserIdRangeDAO.class);
}
@Override
public List<IdRange> queryRange(int rangeSize) {
return repo.query(rangeSize);
}
@Override
public List<UserInfo> getRangeData(long low, long high) {
return high==-1 ? repo.getLastRangeUser((int)low) : repo.getRangeUser((int)low, (int)high);
}
}
4.2UserIdRangeDAO
用户数据访问对象,提供具体的查询实现
public interface UserIdRangeDAO extends JpaRepository<User,Integer> {
@Query(value="SELECT new clearing.parellel.IdRange(FLOOR(id/?1) AS range_id ,COUNT(*) as count) FROM User u GROUP BY FLOOR(id/?1) HAVING COUNT(u)>0 ORDER BY range_id")
List<IdRange> query(int rangeSize);
@Query(value = "select new clearing.query.UserInfo(a.id,b.promoterId,a.isBlocked,a.grade,a.nodeLevel,a.isValid,a.isGradeModified) from User a, Promotion b where a.id = b.accountId and a.id>=?1 and a.id<?2")
List<UserInfo> getRangeUser(Integer low, Integer high);
@Query(value = "select new clearing.query.UserInfo(a.id,b.promoterId,a.isBlocked,a.grade,a.nodeLevel,a.isValid,a.isGradeModified) from User a, Promotion b where a.id = b.accountId and a.id>=?1")
List<UserInfo> getLastRangeUser(Integer low);
}
4.3使用
并行加载用户信息,线程数为线程核心的2倍:
ParellelLoader loader = new ParellelLoader(new ParallelUserLoaderProvider(),2*Runtime.getRuntime().availableProcessors());
loader.run();
List<UserInfo> al = loader.getAll(); ///< 获取读取合并后的所有数据