[关闭]
@skyword 2017-04-28T12:35:17.000000Z 字数 8496 阅读 1129

第六次上机报告--模式匹配算法


问题描述

编写程序比较暴力匹配算法和KMP算法在匹配字符串的时候的比较次数,使用动态数组的顺序存储结构

算法思想

暴力匹配算法(BruteForce)的做法是逐个字符串匹配,当有主串某字符和模板串首字符相等是,向下比较下一字符;当匹配到某个位置出现不同时,回到原来的匹配位置的下一位重新匹配,理论复杂度,其中分别是主串和模板串的规模。

KMP算法对模板串定义了next数组,意义在于,当出现匹配失败的情况时,模板串的匹配下标不是回到初始位置,而是回到位置继续向下匹配。从而节省了不必要的比较,同时保证不会错过某些位置。理论复杂度

next数组的含义:对于下标j,的含义是给出了0~j-1中的最长公共前后缀,从而,下一次匹配时,我们直接回到最长前缀的下一位继续匹配即可

next数组的求法:递推方式。在求出之后:

代码设计

①"Dstring.h"头文件: 定义动态串结构体,并定义了以下函数:

②"main.cpp"主文件: 利用写好的Dstring,实现Brute-Force和KMP并比较

③两种算法比较次数的对比
要点如下:

程序代码

①Dstring.h

  1. #ifndef DSTRING_H_INCLUDED
  2. #define DSTRING_H_INCLUDED
  3. /*
  4. Index of string starts from 0.
  5. */
  6. #include <cstdio>
  7. #include <cstring>
  8. #include <stdexcept>
  9. #include <sstream>
  10. using namespace std;
  11. typedef struct
  12. {
  13. char *str;
  14. int maxLength;//Maximum capacity.
  15. int size;//The Number of Characters Dstring has now.
  16. }Dstring;
  17. void Initiate(Dstring *S, int mlen, char *str)//Initiate Dstring with size = len and str..
  18. {
  19. S->str = (char *)malloc(sizeof(char)*mlen);//Apply memory.
  20. S->maxLength = mlen;
  21. S->size = strlen(str);
  22. int len = S->size;
  23. for(int i = 0; i < len; i++)
  24. {
  25. S->str[i] = str[i];
  26. }
  27. }
  28. bool Insert(Dstring *S, int pos, Dstring T)//Insert Dstring T at the pos-th position of S.
  29. {
  30. if(pos < 0 || pos > S->size)//Illegal pos parameter.
  31. {
  32. ostringstream s;
  33. s<<"Illegal pos parameter."<<endl;
  34. throw invalid_argument(s.str());
  35. return false;
  36. }
  37. char *p;
  38. if(S->size + T.size > S->maxLength)//Apply more memory to store new string.
  39. {
  40. p = (char *)realloc(S->str, (S->size + T.size)*sizeof(char));
  41. if(p == NULL)
  42. {
  43. ostringstream s;
  44. s<<"System has run out of RAM."<<endl;
  45. throw invalid_argument(s.str());
  46. return false;
  47. }
  48. }
  49. for(int i = S->size - 1; i >= pos; i--)//move substring(pos, size-1) T.size units forward.
  50. S->str[i+T.size] = S->str[i];
  51. for(int i = 0; i < T.size; i++)//Insert characters 1 by 1.
  52. S->str[pos+i] = T.str[i];
  53. S->size += T.size;
  54. return true;
  55. }
  56. bool Delete(Dstring *S, int pos, int len)//Delete len units from pos to pos+len.
  57. {
  58. if(S->size <= 0)
  59. {
  60. ostringstream s;
  61. s<<"This string has already been empty."<<endl;
  62. throw invalid_argument(s.str());
  63. return false;
  64. }
  65. if(pos < 0 || len < 0 || pos + len > S->size)
  66. {
  67. ostringstream s;
  68. s<<"Illegal parameter : pos or len."<<endl;
  69. throw invalid_argument(s.str());
  70. return false;
  71. }
  72. else
  73. {
  74. for(int i = pos + len; i <= S->size-1; i++)
  75. S->str[i-len] = S->str[i];
  76. S->size -= len;
  77. return true;
  78. }
  79. }
  80. bool Substring(Dstring *S, int pos, int len, Dstring *T)//Get substring in S from position pos with length len, let T store it.
  81. {
  82. if(pos < 0 || len < 0 || pos + len > S->size)
  83. {
  84. ostringstream s;
  85. s<<"Illegal parameter : pos or len."<<endl;
  86. throw invalid_argument(s.str());
  87. return false;
  88. }
  89. else
  90. {
  91. for(int i = 0; i < len; i++)
  92. T->str[i] = S->str[pos+i];
  93. T->size = len;
  94. return true;
  95. }
  96. }
  97. void Destroy(Dstring *S)
  98. {
  99. free(S->str);
  100. S->size = 0;
  101. S->maxLength = 0;
  102. }
  103. void Dstring_print(Dstring *S)//Output.
  104. {
  105. int len = S->size;
  106. for(int i = 0; i < len; i++)
  107. {
  108. printf("%c",S->str[i]);
  109. }
  110. printf("\n");
  111. }
  112. #endif // DSTRING_H_INCLUDED

②main.cpp

  1. #include <iostream>
  2. #include <malloc.h>
  3. #include <stdexcept>
  4. #include <sstream>
  5. #include <cstdio>
  6. #include "Dstring.h"
  7. using namespace std;
  8. //Pattern-Match--Brutal Algorithm
  9. int Brute_Force_Match(Dstring S, Dstring T, int &cnt)
  10. {
  11. int i, j, pos;
  12. i = 0; j = 0;
  13. int lens = S.size;
  14. int lent = T.size;
  15. while(i < lens && j < lent)
  16. {
  17. if(cnt++ && S.str[i] == T.str[j])
  18. {
  19. i++;j++;
  20. }
  21. else
  22. {
  23. i = i - j + 1;
  24. j = 0;
  25. }
  26. }
  27. if(j == T.size) pos = i - T.size;
  28. else pos = -1;
  29. return pos;
  30. }
  31. //Pattern-Matching--KMP Algorithm
  32. void getNext(Dstring T, int nxt[], int &cnt)
  33. {
  34. int j = 1, k = 0;
  35. nxt[0] = -1;
  36. nxt[1] = 0;
  37. while(j < T.size)
  38. {
  39. if(cnt++ && T.str[j] == T.str[k])
  40. {
  41. nxt[j+1] = k + 1;
  42. j++;k++;
  43. }
  44. else if(k == 0)
  45. {
  46. nxt[j+1] = 0;
  47. j++;
  48. }
  49. else k = nxt[k];
  50. }
  51. }
  52. int KMP(Dstring S, Dstring T, int nxt[], int &cnt)
  53. {
  54. getNext(T, nxt, cnt);
  55. int i = 0, j = 0;
  56. while(i < S.size && j < S.size)
  57. {
  58. if(cnt++ && S.str[i] == T.str[j])
  59. {
  60. i++;j++;
  61. if(j == T.size)break;
  62. //add this to guarantee that algorithm return the position where pattern first appear.
  63. }
  64. else if(j == 0)i++;
  65. else j = nxt[j];
  66. }
  67. int pos;
  68. if(j == T.size)pos = i - T.size;
  69. else pos = -1;
  70. return pos;
  71. }
  72. int nxt[200];
  73. int main()
  74. {
  75. freopen("in.txt","r",stdin);
  76. Dstring a, b;
  77. int n, len, cnt1, cnt2;
  78. char *c = (char *)malloc(sizeof(char)*200);
  79. scanf("%d",&n);
  80. getchar();
  81. for(int index = 1; index <= n; index++)
  82. {
  83. cin>>c;
  84. len = strlen(c) + 1;
  85. //cout<<"**"<<len<<endl;
  86. Initiate(&a, len, c);
  87. cout<<endl;
  88. scanf("%s",c);
  89. len = strlen(c) + 1;
  90. Initiate(&b, len, c);
  91. cnt1 = cnt2 = 0;
  92. int pos1 = Brute_Force_Match(a, b, cnt1);
  93. int pos2 = KMP(a, b, nxt, cnt2);
  94. printf("Test case #%d:\n",index);
  95. //printf("Original String: ");
  96. //Dstring_print(&a);
  97. //printf("Patten String: ");
  98. //Dstring_print(&b);
  99. printf("Using BF Algorithm: %d , comparing %d times.\n", pos1, cnt1);
  100. printf("Using KMP Algorithm: %d , comparing %d times.\n", pos2, cnt2);
  101. }
  102. }

测试样例与测试结果

数据:随机生成的小规模数据

  1. 13
  2. abcdefg
  3. hijk
  4. abcdefg
  5. abcdefg
  6. abcdefg
  7. efg
  8. abcabc
  9. abc
  10. cdacacac
  11. caca
  12. cccc
  13. c
  14. ccdd
  15. cd
  16. fkajfjkkellfjkbnmffefilckajaafinncme
  17. ellfjkbnmffefi
  18. dhjelkd
  19. dwjidowwkjdnkjbja
  20. afewfefefecdfeffgthyttrwedcdfefsrfaffcdfefsggrdg
  21. cd
  22. aaaaaaaa
  23. aaaab
  24. cddcdc
  25. abcde
  26. fjaeislfhakjklfeaufjkejfujfehfjeukfheyfefgejhfefdhwdhwhdwhdadwhkjdhwjadhwkjadhwjkadhjkwahdjkwahdjaaaaaaaaaaaa
  27. whkjdhwjadhwkj

结果:
在不考虑next数组用到的匹配的时候

  1. Test case #1:
  2. Using BF Algorithm: -1 , comparing 7 times.
  3. Using KMP Algorithm: -1 , comparing 7 times.
  4. Test case #2:
  5. Using BF Algorithm: -1 , comparing 7 times.
  6. Using KMP Algorithm: -1 , comparing 7 times.
  7. Test case #3:
  8. Using BF Algorithm: 4 , comparing 7 times.
  9. Using KMP Algorithm: 4 , comparing 7 times.
  10. Test case #4:
  11. Using BF Algorithm: 3 , comparing 6 times.
  12. Using KMP Algorithm: 3 , comparing 6 times.
  13. Test case #5:
  14. Using BF Algorithm: 3 , comparing 7 times.
  15. Using KMP Algorithm: 3 , comparing 7 times.
  16. Test case #6:
  17. Using BF Algorithm: 1 , comparing 2 times.
  18. Using KMP Algorithm: 1 , comparing 2 times.
  19. Test case #7:
  20. Using BF Algorithm: 1 , comparing 3 times.
  21. Using KMP Algorithm: 1 , comparing 3 times.
  22. Test case #8:
  23. Using BF Algorithm: 8 , comparing 22 times.
  24. Using KMP Algorithm: 8 , comparing 22 times.
  25. Test case #9:
  26. Using BF Algorithm: -1 , comparing 7 times.
  27. Using KMP Algorithm: -1 , comparing 7 times.
  28. Test case #10:
  29. Using BF Algorithm: 10 , comparing 12 times.
  30. Using KMP Algorithm: 10 , comparing 12 times.
  31. Test case #11:
  32. Using BF Algorithm: -1 , comparing 20 times.
  33. Using KMP Algorithm: -1 , comparing 11 times.
  34. Test case #12:
  35. Using BF Algorithm: -1 , comparing 6 times.
  36. Using KMP Algorithm: -1 , comparing 6 times.
  37. Test case #13:
  38. Using BF Algorithm: 61 , comparing 80 times.
  39. Using KMP Algorithm: 61 , comparing 78 times.
  40. Process returned 0 (0x0) execution time : 0.054 s
  41. Press any key to continue.

在考虑了next的匹配次数之后

  1. Test case #1:
  2. Using BF Algorithm: -1 , comparing 7 times.
  3. Using KMP Algorithm: -1 , comparing 10 times.
  4. Test case #2:
  5. Using BF Algorithm: -1 , comparing 7 times.
  6. Using KMP Algorithm: 0 , comparing 13 times.
  7. Test case #3:
  8. Using BF Algorithm: 4 , comparing 7 times.
  9. Using KMP Algorithm: 4 , comparing 9 times.
  10. Test case #4:
  11. Using BF Algorithm: 3 , comparing 6 times.
  12. Using KMP Algorithm: 0 , comparing 5 times.
  13. Test case #5:
  14. Using BF Algorithm: 3 , comparing 7 times.
  15. Using KMP Algorithm: 3 , comparing 11 times.
  16. Test case #6:
  17. Using BF Algorithm: 1 , comparing 2 times.
  18. Using KMP Algorithm: 1 , comparing 2 times.
  19. Test case #7:
  20. Using BF Algorithm: 1 , comparing 3 times.
  21. Using KMP Algorithm: 1 , comparing 5 times.
  22. Test case #8:
  23. Using BF Algorithm: 8 , comparing 22 times.
  24. Using KMP Algorithm: 8 , comparing 36 times.
  25. Test case #9:
  26. Using BF Algorithm: -1 , comparing 7 times.
  27. Using KMP Algorithm: -1 , comparing 26 times.
  28. Test case #10:
  29. Using BF Algorithm: 10 , comparing 12 times.
  30. Using KMP Algorithm: 10 , comparing 13 times.
  31. Test case #11:
  32. Using BF Algorithm: -1 , comparing 20 times.
  33. Using KMP Algorithm: -1 , comparing 15 times.
  34. Test case #12:
  35. Using BF Algorithm: -1 , comparing 6 times.
  36. Using KMP Algorithm: -1 , comparing 10 times.
  37. Test case #13:
  38. Using BF Algorithm: 61 , comparing 80 times.
  39. Using KMP Algorithm: 61 , comparing 93 times.
  40. Process returned 0 (0x0) execution time : 0.043 s
  41. Press any key to continue.

在小规模数据下,两种算法差别并不大
随机生成了几组大规模数据进行测试,其中

  1. Test case #14:
  2. Original String Size: 100000
  3. Patten String Size: 30720
  4. Using BF Algorithm: 63136 , comparing 96413 times.
  5. Using KMP Algorithm: 63136 , comparing 128248 times.
  6. Test case #15:
  7. Original String Size: 100000
  8. Patten String Size: 54
  9. Using BF Algorithm: 91754 , comparing 110191 times.
  10. Using KMP Algorithm: 91754 , comparing 107137 times.
  11. Test case #16:
  12. Original String Size: 100000
  13. Patten String Size: 2048
  14. Using BF Algorithm: 48173 , comparing 97859 times.
  15. Using KMP Algorithm: 48173 , comparing 67859 times.
  16. Process returned 0 (0x0) execution time : 0.320 s
  17. Press any key to continue.

结论是在小规模测试中,求next数组带来的开销相对较大不可忽视,计入这一部分时,kmp算法的比较次数可能更多,在更大规模测试下(主串规模远远大于模式串),同时主串和模式串的元素种类较少时,KMP的效率才会比较明显的体现出来

动态数组设计方式和静态的区别

添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注