基于CUDA的高精度多序列比对算法实现与优化及双序列比对加速的效率与准确性研究-CSDN博客

目录
第1章绪论 4
1.1 引言 4
1.2 序列比对 4
1.2.1 双序列比对算法 4
1.2.2 多序列比对算法 5
1.3 CUDA平台 5
1.4 本文研究目标与内容 6
第2章双序列比对算法 4
2.1 序列比对 4
2.1.1 引入序列比对的原因 4
2.1.2 全局比对与局部比对 5
2.2 Needleman-Wunsch(NW)算法 5
2.2.1 NW算法简介 5
2.2.2 NW算法举例介绍 6
2.3 Smith-Waterman(SW)算法 7
2.3.1 SW算法简介 7
2.3.2 SW算法举例介绍 8
2.4 BLAST算法 9
2.5 本章小结 11
第3章多序列比对算法 13
3.1 CLUSTAL算法引言 13
3.2 CLUSTAL算法简述 14
3.2.1 两两比对 14
3.2.2 构建向导树 15
3.2.3 渐进式比对 16
3.3 本章小结 22
第4章 CUDA下实现CLUSTAL 23
4.1 CUDA编程 23
4.2 CUDA下CLUSTAL执行流程 25
4.3 CUDA实现CLUSTAL代码 25
4.3.1 kernel函数 25
4.3.2 CPU端代码 28
4.4 本章小结 29
第5章实验结果及分析 30
5.1 实验环境 30
5.1.1 实验硬件配置 30
5.1.2 实验运行环境 30
5.2 实验过程 30
5.2.1 实验数据信息 30
5.2.2 实验结果统计 30
5.2.3 实验截图 31
5.3 实验结果分析 33
5.4 本章小结 34
结论 35
致谢 36
参考文献 37
1.3CUDA平台
随着算法的更新迭代，越来越多的基因组序列也同时被研究人员测定，生物序列的数量变得越来越庞大，这对算法的性能与效率提出了更高的要求。与此同时,多核与众核体系架构的引入使得计算机的计算能力成倍增长，GPU[12] 的并行处理能力越来越强大,而且不再拘泥于传统的图形处理,而能开始辅助CPU 完成图形计算以外的运算，如并行执行某个方法等，如果能充分利用 GPU 的并行处理能力,与单纯的 CPU 计算相比, 计算性能可得到很大的提升，进而改良现有的算法，在计算速度加快的同时，获取高精度比较结果。
CUDA(Compute Unified Device Architecture)[13]，是显卡厂商NVIDIA推出的运算平台。CUDA是一种由NVIDIA推出的通用并行计算架构，该架构使GPU能够解决复杂的计算问题。它包含了CUDA指令集架构(ISA)以及GPU内部的并行计算引擎。开发人员可以使用C语言来为CUDA架构编写程序，由于C语言是应用最广泛的一种高级编程语言，所以具有较高的普适性，所编写出的程序也可以在支持CUDA的处理器上以超高性能运行。
1.4本文研究目标与内容
多序列比对算法需要消耗巨大的时间，本文将首先对传统的双序列比对算法进行研究，然后对精度最高的CLUSTAL算法进行原理分析与具体实现。之后，利用CUDA平台调度GPU实现的并行处理能力，对CLUSTAL调用双序列比对算法的方式进行改进，使其并行执行，以此来加速执行CLUSTAL算法。最后通过多次实验、统计时间、评估性能，结果表明，基于CUDA的多序列比对算法性能优于原始的算法性能。论文的内容具体安排如下:
第一章主要介绍了研究课题背景与历史、经典算法概要与CUDA平台的介绍，简要说明了研究的方法与主要研究方向、目的，最后对本文的工作进行简介。
第二章主要介绍生物信息学的基础:双序列比对，并对经典的双序列比对算法进行具体的分析。
第三章根据CLUSTAL算法的思想，分析CLUSTAL算法的原理，并用C++进行具体实现。
第四章利用CUDA平台调度GPU实现的并行处理能力，让CLUSTAL算法原本顺序执行双序列比对的过程并行执行，加速比对过程，实现算法优化效果
第五章采用多组数据，测试两张多序列比对算法的性能，通过实验结果，对基于CUDA的多序列比对算法进行评估。
第六章对全文进行总结，并对算法的不足之处进行分析，指出今后的研究方向。

#include <stdio.h>
#include<iostream>
#include <string>
#include<algorithm>
#include <vector>                          
#include <iterator> 
#include<ctime>
#include <windows.h>
//声明命名空间std
using namespace std;
#define MATCH 1
#define DIS_MATCH -1
#define INDEL -3
#define INDEL_CHAR '-'

class ResUnit {		//一次双序列比对后的结果
public:
	string str1;	//原始序列1
	string str2;	//原始序列2
	string res1;	//结果序列1
	string res2;	//结果序列2
	int score;		//序列总得分，反映两个序列的相似程度
	int tag;		//禁止迭代多次
};

class SingleSeq {	//一个序列被整合后的样子
public:
	string str;		//一个序列的原始序列
	string res;		//一个序列被整合后的样子

};

struct BacktrackingUnit {
	int goUp;		//是否向上回溯
	int goLeftUp;	//是否向左上回溯
	int goLeft;		//是否向左回溯
	int score;		//得分矩阵第(i, j)这个单元的分值
};


typedef struct BacktrackingUnit *unitLine;

int max3(int a, int b, int c);
ResUnit traceback(unitLine** item, const int i, const int j, string str1, string str2, string res1, string res2, int n, ResUnit resUnit);
ResUnit NeedlemanWunch(string str1, string str2);

struct SequenceUnit {
	string *str1;	//匹配序列1
	string *str2;	//匹配序列2
	int score;
};

int main() {
	ResUnit s = NeedlemanWunch("GGATCGA","GAATTCAGTTA");
	cout << s.res1 << endl;
	cout << s.res2 << endl;
	system("pause");
}


/**
比较三种路径之间谁最大

f(i-1,j-1),f(i-1,j)+indel,f(i,j-1)+indel
*/
int max3(int a, int b, int c) {
	int temp = a > b ? a : b;
	return temp > c ? temp : c;
}

/**
比较两个字符类型属于什么，match，dismatch，indel
*/
int myCompare(char a, char b) {
	if (a == b)
		return MATCH;
	else if (a == ' ' || b == ' ')
		return INDEL;
	else
		return DIS_MATCH;
}


ResUnit traceback(unitLine** item, const int i, const int j, string str1, string str2, string res1, string res2, int n, ResUnit resUnit) {
	unitLine temp = item[i][j];
	if (resUnit.tag != 1)
	{
		if (!(i || j)) {   // 到矩阵单元(0, 0)才算结束，这代表初始的两个字符串的每个字符都被比对到了

			resUnit.str1 = str1;
			resUnit.str2 = str2;
			resUnit.res1 = res1;
			resUnit.res2 = res2;
			resUnit.tag = 1;
			return resUnit;
		}
		if (temp->goUp) {    // 向上回溯一格
			res1 = str1[i - 1] + res1;
			res2 = INDEL_CHAR + res2;
			resUnit = traceback(item, i - 1, j, str1, str2, res1, res2, n + 1, resUnit);
		}
		if (temp->goLeftUp) {    // 向左上回溯一格 
			res1 = str1[i - 1] + res1;
			res2 = str2[j - 1] + res2;
			resUnit = traceback(item, i - 1, j - 1, str1, str2, res1, res2, n + 1, resUnit);
		}
		if (temp->goLeft) {    // 向左回溯一格
			res1 = INDEL_CHAR + res1;
			res2 = str2[j - 1] + res2;
			resUnit = traceback(item, i, j - 1, str1, str2, res1, res2, n + 1, resUnit);
		}
		return resUnit;
	}
	else
	{
		return resUnit;
	}

}


ResUnit NeedlemanWunch(string str1, string str2) {
	//字符串str1,str2长度
	const int m = str1.length();
	const int n = str2.length();

	int m1, m2, m3, mm;

	unitLine **unit;

	// 初始化
	if ((unit = (unitLine **)malloc(sizeof(unitLine*) * (m + 1))) == NULL) {
		fputs("Error: Out of space!\n", stderr);
		exit(1);
	}
	for (int i = 0; i <= m; i++) {
		if ((unit[i] = (unitLine *)malloc(sizeof(unitLine) * (n + 1))) == NULL) {
			fputs("Error: Out of space!\n", stderr);
			exit(1);
		}
		for (int j = 0; j <= n; j++) {
			if ((unit[i][j] = (unitLine)malloc(sizeof(struct BacktrackingUnit))) == NULL) {
				fputs("Error: Out of space!\n", stderr);
				exit(1);
			}
			unit[i][j]->goUp = 0;
			unit[i][j]->goLeftUp = 0;
			unit[i][j]->goLeft = 0;
		}
	}
	unit[0][0]->score = 0;
	for (int i = 1; i <= m; i++) {
		unit[i][0]->score = INDEL * i;
		unit[i][0]->goUp = 1;
	}
	for (int j = 1; j <= n; j++) {
		unit[0][j]->score = INDEL * j;
		unit[0][j]->goLeft = 1;
	}


	// 动态规划算法计算得分矩阵每个单元的分值
	for (int i = 1; i <= m; i++) {
		for (int j = 1; j <= n; j++) {
			m1 = unit[i - 1][j]->score + INDEL;
			m2 = unit[i - 1][j - 1]->score + myCompare(str1[i - 1], str2[j - 1]);
			m3 = unit[i][j - 1]->score + INDEL;
			mm = max3(m1, m2, m3);
			unit[i][j]->score = mm;
			//判断路径来源
			if (m1 == mm) unit[i][j]->goUp = 1;
			if (m2 == mm) unit[i][j]->goLeftUp = 1;
			if (m3 == mm) unit[i][j]->goLeft = 1;
		}
	}


	//开始回溯
	ResUnit res;
	res.tag = 0;
	res = traceback(unit, m, n, str1, str2, "", "", 0, res);
	res.score = unit[m][n]->score;


	//释放内存
	for (int i = 0; i <= m; i++) {
		for (int j = 0; j <= n; j++) {
			free(unit[i][j]);
		}
		free(unit[i]);
	}
	free(unit);

	//返回值
	return res;
}