@skyword
2017-04-28T04:35:17.000000Z
字数 8496
阅读 1324
编写程序比较暴力匹配算法和KMP算法在匹配字符串的时候的比较次数,使用动态数组的顺序存储结构
暴力匹配算法(BruteForce)的做法是逐个字符串匹配,当有主串某字符和模板串首字符相等是,向下比较下一字符;当匹配到某个位置出现不同时,回到原来的匹配位置的下一位重新匹配,理论复杂度,其中和分别是主串和模板串的规模。
KMP算法对模板串定义了next数组,意义在于,当出现匹配失败的情况时,模板串的匹配下标不是回到初始位置,而是回到位置继续向下匹配。从而节省了不必要的比较,同时保证不会错过某些位置。理论复杂度
next数组的含义:对于下标j,的含义是给出了0~j-1中的最长公共前后缀,从而,下一次匹配时,我们直接回到最长前缀的下一位继续匹配即可
next数组的求法:递推方式。在求出之后:
①"Dstring.h"头文件: 定义动态串结构体,并定义了以下函数:
②"main.cpp"主文件: 利用写好的Dstring,实现Brute-Force和KMP并比较
③两种算法比较次数的对比
要点如下:
①Dstring.h
#ifndef DSTRING_H_INCLUDED#define DSTRING_H_INCLUDED/*Index of string starts from 0.*/#include <cstdio>#include <cstring>#include <stdexcept>#include <sstream>using namespace std;typedef struct{char *str;int maxLength;//Maximum capacity.int size;//The Number of Characters Dstring has now.}Dstring;void Initiate(Dstring *S, int mlen, char *str)//Initiate Dstring with size = len and str..{S->str = (char *)malloc(sizeof(char)*mlen);//Apply memory.S->maxLength = mlen;S->size = strlen(str);int len = S->size;for(int i = 0; i < len; i++){S->str[i] = str[i];}}bool Insert(Dstring *S, int pos, Dstring T)//Insert Dstring T at the pos-th position of S.{if(pos < 0 || pos > S->size)//Illegal pos parameter.{ostringstream s;s<<"Illegal pos parameter."<<endl;throw invalid_argument(s.str());return false;}char *p;if(S->size + T.size > S->maxLength)//Apply more memory to store new string.{p = (char *)realloc(S->str, (S->size + T.size)*sizeof(char));if(p == NULL){ostringstream s;s<<"System has run out of RAM."<<endl;throw invalid_argument(s.str());return false;}}for(int i = S->size - 1; i >= pos; i--)//move substring(pos, size-1) T.size units forward.S->str[i+T.size] = S->str[i];for(int i = 0; i < T.size; i++)//Insert characters 1 by 1.S->str[pos+i] = T.str[i];S->size += T.size;return true;}bool Delete(Dstring *S, int pos, int len)//Delete len units from pos to pos+len.{if(S->size <= 0){ostringstream s;s<<"This string has already been empty."<<endl;throw invalid_argument(s.str());return false;}if(pos < 0 || len < 0 || pos + len > S->size){ostringstream s;s<<"Illegal parameter : pos or len."<<endl;throw invalid_argument(s.str());return false;}else{for(int i = pos + len; i <= S->size-1; i++)S->str[i-len] = S->str[i];S->size -= len;return true;}}bool Substring(Dstring *S, int pos, int len, Dstring *T)//Get substring in S from position pos with length len, let T store it.{if(pos < 0 || len < 0 || pos + len > S->size){ostringstream s;s<<"Illegal parameter : pos or len."<<endl;throw invalid_argument(s.str());return false;}else{for(int i = 0; i < len; i++)T->str[i] = S->str[pos+i];T->size = len;return true;}}void Destroy(Dstring *S){free(S->str);S->size = 0;S->maxLength = 0;}void Dstring_print(Dstring *S)//Output.{int len = S->size;for(int i = 0; i < len; i++){printf("%c",S->str[i]);}printf("\n");}#endif // DSTRING_H_INCLUDED
②main.cpp
#include <iostream>#include <malloc.h>#include <stdexcept>#include <sstream>#include <cstdio>#include "Dstring.h"using namespace std;//Pattern-Match--Brutal Algorithmint Brute_Force_Match(Dstring S, Dstring T, int &cnt){int i, j, pos;i = 0; j = 0;int lens = S.size;int lent = T.size;while(i < lens && j < lent){if(cnt++ && S.str[i] == T.str[j]){i++;j++;}else{i = i - j + 1;j = 0;}}if(j == T.size) pos = i - T.size;else pos = -1;return pos;}//Pattern-Matching--KMP Algorithmvoid getNext(Dstring T, int nxt[], int &cnt){int j = 1, k = 0;nxt[0] = -1;nxt[1] = 0;while(j < T.size){if(cnt++ && T.str[j] == T.str[k]){nxt[j+1] = k + 1;j++;k++;}else if(k == 0){nxt[j+1] = 0;j++;}else k = nxt[k];}}int KMP(Dstring S, Dstring T, int nxt[], int &cnt){getNext(T, nxt, cnt);int i = 0, j = 0;while(i < S.size && j < S.size){if(cnt++ && S.str[i] == T.str[j]){i++;j++;if(j == T.size)break;//add this to guarantee that algorithm return the position where pattern first appear.}else if(j == 0)i++;else j = nxt[j];}int pos;if(j == T.size)pos = i - T.size;else pos = -1;return pos;}int nxt[200];int main(){freopen("in.txt","r",stdin);Dstring a, b;int n, len, cnt1, cnt2;char *c = (char *)malloc(sizeof(char)*200);scanf("%d",&n);getchar();for(int index = 1; index <= n; index++){cin>>c;len = strlen(c) + 1;//cout<<"**"<<len<<endl;Initiate(&a, len, c);cout<<endl;scanf("%s",c);len = strlen(c) + 1;Initiate(&b, len, c);cnt1 = cnt2 = 0;int pos1 = Brute_Force_Match(a, b, cnt1);int pos2 = KMP(a, b, nxt, cnt2);printf("Test case #%d:\n",index);//printf("Original String: ");//Dstring_print(&a);//printf("Patten String: ");//Dstring_print(&b);printf("Using BF Algorithm: %d , comparing %d times.\n", pos1, cnt1);printf("Using KMP Algorithm: %d , comparing %d times.\n", pos2, cnt2);}}
数据:随机生成的小规模数据
13abcdefghijkabcdefgabcdefgabcdefgefgabcabcabccdacacaccacacccccccddcdfkajfjkkellfjkbnmffefilckajaafinncmeellfjkbnmffefidhjelkddwjidowwkjdnkjbjaafewfefefecdfeffgthyttrwedcdfefsrfaffcdfefsggrdgcdaaaaaaaaaaaabcddcdcabcdefjaeislfhakjklfeaufjkejfujfehfjeukfheyfefgejhfefdhwdhwhdwhdadwhkjdhwjadhwkjadhwjkadhjkwahdjkwahdjaaaaaaaaaaaawhkjdhwjadhwkj
结果:
在不考虑next数组用到的匹配的时候
Test case #1:Using BF Algorithm: -1 , comparing 7 times.Using KMP Algorithm: -1 , comparing 7 times.Test case #2:Using BF Algorithm: -1 , comparing 7 times.Using KMP Algorithm: -1 , comparing 7 times.Test case #3:Using BF Algorithm: 4 , comparing 7 times.Using KMP Algorithm: 4 , comparing 7 times.Test case #4:Using BF Algorithm: 3 , comparing 6 times.Using KMP Algorithm: 3 , comparing 6 times.Test case #5:Using BF Algorithm: 3 , comparing 7 times.Using KMP Algorithm: 3 , comparing 7 times.Test case #6:Using BF Algorithm: 1 , comparing 2 times.Using KMP Algorithm: 1 , comparing 2 times.Test case #7:Using BF Algorithm: 1 , comparing 3 times.Using KMP Algorithm: 1 , comparing 3 times.Test case #8:Using BF Algorithm: 8 , comparing 22 times.Using KMP Algorithm: 8 , comparing 22 times.Test case #9:Using BF Algorithm: -1 , comparing 7 times.Using KMP Algorithm: -1 , comparing 7 times.Test case #10:Using BF Algorithm: 10 , comparing 12 times.Using KMP Algorithm: 10 , comparing 12 times.Test case #11:Using BF Algorithm: -1 , comparing 20 times.Using KMP Algorithm: -1 , comparing 11 times.Test case #12:Using BF Algorithm: -1 , comparing 6 times.Using KMP Algorithm: -1 , comparing 6 times.Test case #13:Using BF Algorithm: 61 , comparing 80 times.Using KMP Algorithm: 61 , comparing 78 times.Process returned 0 (0x0) execution time : 0.054 sPress any key to continue.
在考虑了next的匹配次数之后
Test case #1:Using BF Algorithm: -1 , comparing 7 times.Using KMP Algorithm: -1 , comparing 10 times.Test case #2:Using BF Algorithm: -1 , comparing 7 times.Using KMP Algorithm: 0 , comparing 13 times.Test case #3:Using BF Algorithm: 4 , comparing 7 times.Using KMP Algorithm: 4 , comparing 9 times.Test case #4:Using BF Algorithm: 3 , comparing 6 times.Using KMP Algorithm: 0 , comparing 5 times.Test case #5:Using BF Algorithm: 3 , comparing 7 times.Using KMP Algorithm: 3 , comparing 11 times.Test case #6:Using BF Algorithm: 1 , comparing 2 times.Using KMP Algorithm: 1 , comparing 2 times.Test case #7:Using BF Algorithm: 1 , comparing 3 times.Using KMP Algorithm: 1 , comparing 5 times.Test case #8:Using BF Algorithm: 8 , comparing 22 times.Using KMP Algorithm: 8 , comparing 36 times.Test case #9:Using BF Algorithm: -1 , comparing 7 times.Using KMP Algorithm: -1 , comparing 26 times.Test case #10:Using BF Algorithm: 10 , comparing 12 times.Using KMP Algorithm: 10 , comparing 13 times.Test case #11:Using BF Algorithm: -1 , comparing 20 times.Using KMP Algorithm: -1 , comparing 15 times.Test case #12:Using BF Algorithm: -1 , comparing 6 times.Using KMP Algorithm: -1 , comparing 10 times.Test case #13:Using BF Algorithm: 61 , comparing 80 times.Using KMP Algorithm: 61 , comparing 93 times.Process returned 0 (0x0) execution time : 0.043 sPress any key to continue.
在小规模数据下,两种算法差别并不大
随机生成了几组大规模数据进行测试,其中
Test case #14:Original String Size: 100000Patten String Size: 30720Using BF Algorithm: 63136 , comparing 96413 times.Using KMP Algorithm: 63136 , comparing 128248 times.Test case #15:Original String Size: 100000Patten String Size: 54Using BF Algorithm: 91754 , comparing 110191 times.Using KMP Algorithm: 91754 , comparing 107137 times.Test case #16:Original String Size: 100000Patten String Size: 2048Using BF Algorithm: 48173 , comparing 97859 times.Using KMP Algorithm: 48173 , comparing 67859 times.Process returned 0 (0x0) execution time : 0.320 sPress any key to continue.
结论是在小规模测试中,求next数组带来的开销相对较大不可忽视,计入这一部分时,kmp算法的比较次数可能更多,在更大规模测试下(主串规模远远大于模式串),同时主串和模式串的元素种类较少时,KMP的效率才会比较明显的体现出来