[关闭]
@Spongcer 2015-03-13T12:32:08.000000Z 字数 3595 阅读 1724

Code Optimizer

Code


  1. //在一个DWORD中找到第一个(或最后一个)为1的位
  2. #ifdef USE_NEW_BITMAP
  3. static INLINE Count32_t bm_find_one_in_dword(Count32_t dwWord, Bool8_t bDir)
  4. {
  5. Count32_t dwPos = 0, dwShift = 0;
  6. if (0 == dwWord) return BM_NOT_FOUND;
  7. if (bDir)
  8. {
  9. dwShift = ((0 == (dwWord & 0x0000FFFF)) << 4);
  10. dwPos += dwShift;
  11. dwWord >>= dwShift;
  12. dwShift = ((0 == (dwWord & 0x00FF)) << 3);
  13. dwPos += dwShift;
  14. dwWord >>= dwShift;
  15. dwShift = ((0 == (dwWord & 0x0F)) << 2);
  16. dwPos += dwShift;
  17. dwWord >>= dwShift;
  18. dwShift = ((0 == (dwWord & 0x03)) << 1);
  19. dwPos += dwShift;
  20. dwWord >>= dwShift;
  21. dwShift = (0 == (dwWord & 0x01));
  22. dwPos += dwShift;
  23. dwWord >>= dwShift;
  24. }
  25. else
  26. {
  27. dwShift = ((0 == (dwWord & 0xFFFF0000)) << 4);
  28. dwPos += dwShift;
  29. dwWord <<= dwShift;
  30. dwShift = ((0 == (dwWord & 0xFF000000)) << 3);
  31. dwPos += dwShift;
  32. dwWord <<= dwShift;
  33. dwShift = ((0 == (dwWord & 0xF0000000)) << 2);
  34. dwPos += dwShift;
  35. dwWord <<= dwShift;
  36. dwShift = ((0 == (dwWord & 0xC0000000)) << 1);
  37. dwPos += dwShift;
  38. dwWord <<= dwShift;
  39. dwShift = (0 == (dwWord & 0x80000000));
  40. dwPos += dwShift;
  41. dwWord <<= dwShift;
  42. dwPos += ((dwWord & 0x80000000) != 0);
  43. dwPos = 32 - dwPos;
  44. }
  45. return dwPos;
  46. }
  47. #else
  48. static Count32_t bm_find_one_in_dword (Count32_t dwWord, Bool8_t bDir )
  49. {
  50. Count32_t i=0;
  51. Count32_t dwWord2;
  52. if (0 == dwWord) return BM_NOT_FOUND;
  53. if (bDir) {
  54. while(1) {
  55. dwWord2 = ((dwWord >> 1) << 1);
  56. if (dwWord == dwWord2) {
  57. dwWord >>= 1;
  58. i++;
  59. continue;
  60. }
  61. break;
  62. }
  63. } else {
  64. while(dwWord>1) {
  65. dwWord >>= 1;
  66. i++;
  67. }
  68. }
  69. return i;
  70. }
  71. #endif
  /*
   * VectorBATZeroNullBitmap(pvbVector)
   *
   * Clear memory using 4-way unrolled SIMD stores: sets all
   * VECBAT_BITMAP_SIZE words of (pvbVector)->dwNullBitmap to 0.
   *
   * SIMD variant: each loop iteration issues four unaligned 128-bit stores,
   * i.e. it clears 16 bitmap words per iteration (this assumes Count32_t is
   * 4 bytes wide, so one __m128i covers 4 words); any remaining words are
   * cleared one at a time. The fallback variant is a single MEMSET.
   *
   * Usage: both variants must be invoked as a statement followed by ';'.
   * The SIMD variant is wrapped in do { } while (0) so that it behaves like
   * a single statement (safe in an unbraced if/else), and its locals carry a
   * vbz_ prefix and are declared after the argument has been evaluated, so an
   * argument expression that mentions names such as `i` or `j` refers to the
   * caller's variables rather than to the macro's own.
   */
  #ifdef USE_SIMD_OPT
  #define VectorBATZeroNullBitmap(pvbVector) \
      do { \
          Count32P_t vbz_pdwWords_ = (pvbVector)->dwNullBitmap; \
          __m128i *vbz_pVec_ = (__m128i *)vbz_pdwWords_; \
          const __m128i vbz_zero_ = _mm_setzero_si128(); \
          const Count32_t vbz_dwSize_ = VECBAT_BITMAP_SIZE; \
          const Count32_t vbz_dwBlocks_ = vbz_dwSize_ >> 4; /* 16 words per block */ \
          Count32_t vbz_i_ = 0; \
          Count32_t vbz_j_ = 0; \
          \
          for (vbz_j_ = 0; vbz_j_ < vbz_dwBlocks_; vbz_j_++) \
          { \
              _mm_storeu_si128(vbz_pVec_ + vbz_i_, vbz_zero_); \
              _mm_storeu_si128(vbz_pVec_ + vbz_i_ + 1, vbz_zero_); \
              _mm_storeu_si128(vbz_pVec_ + vbz_i_ + 2, vbz_zero_); \
              _mm_storeu_si128(vbz_pVec_ + vbz_i_ + 3, vbz_zero_); \
              vbz_i_ += 4; \
          } \
          \
          /* convert the 128-bit vector index into a word index */ \
          vbz_i_ <<= 2; \
          \
          for (; vbz_i_ < vbz_dwSize_; vbz_i_++) \
          { \
              vbz_pdwWords_[vbz_i_] = 0; \
          } \
      } while (0)
  #else
  #define VectorBATZeroNullBitmap(pvbVector) MEMSET((pvbVector)->dwNullBitmap, 0, VECBAT_BITMAP_SIZE * sizeof(Count32_t))
  #endif
  1. //采用8路SIMD进行浮点数求和运算
  2. static INLINE Datum
  3. X_sum_vec_Double_t(X_AGG_VEC_ARGS)
  4. {
  5. Count_t i = 0;
  6. Double_t dtdst = 0;
  7. if (!bExistNull)
  8. {
  9. #if (defined(OSC_64BIT_ARCH) && defined(USE_SIMD_OPT))
  10. __m128d __mSum0 = _mm_setzero_pd();
  11. __m128d __mSum1 = _mm_setzero_pd();
  12. __m128d __mSum2 = _mm_setzero_pd();
  13. __m128d __mSum3 = _mm_setzero_pd();
  14. __m128d __mSum4 = _mm_setzero_pd();
  15. __m128d __mSum5 = _mm_setzero_pd();
  16. __m128d __mSum6 = _mm_setzero_pd();
  17. __m128d __mSum7 = _mm_setzero_pd();
  18. __m128d __mLoad0, __mLoad1, __mLoad2, __mLoad3;
  19. __m128d __mLoad4, __mLoad5, __mLoad6, __mLoad7;
  20. Double_t *pDatum = (Double_t *)dtlft;
  21. Count_t dwBlock = (dwCount >> 4);
  22. Count_t j = 0;
  23. for (j = 0; j < dwBlock; j++)
  24. {
  25. __mLoad0 = _mm_loadu_pd(pDatum + i);
  26. __mLoad1 = _mm_loadu_pd(pDatum + i + 2);
  27. __mLoad2 = _mm_loadu_pd(pDatum + i + 4);
  28. __mLoad3 = _mm_loadu_pd(pDatum + i + 6);
  29. __mLoad4 = _mm_loadu_pd(pDatum + i + 8);
  30. __mLoad5 = _mm_loadu_pd(pDatum + i + 10);
  31. __mLoad6 = _mm_loadu_pd(pDatum + i + 12);
  32. __mLoad7 = _mm_loadu_pd(pDatum + i + 14);
  33. __mSum0 = _mm_add_pd(__mSum0, __mLoad0);
  34. __mSum1 = _mm_add_pd(__mSum1, __mLoad1);
  35. __mSum2 = _mm_add_pd(__mSum2, __mLoad2);
  36. __mSum3 = _mm_add_pd(__mSum3, __mLoad3);
  37. __mSum4 = _mm_add_pd(__mSum4, __mLoad4);
  38. __mSum5 = _mm_add_pd(__mSum5, __mLoad5);
  39. __mSum6 = _mm_add_pd(__mSum6, __mLoad6);
  40. __mSum7 = _mm_add_pd(__mSum7, __mLoad7);
  41. i += 16;
  42. }
  43. __mSum0 = _mm_add_pd(__mSum0, __mSum1);
  44. __mSum2 = _mm_add_pd(__mSum2, __mSum3);
  45. __mSum4 = _mm_add_pd(__mSum4, __mSum5);
  46. __mSum6 = _mm_add_pd(__mSum6, __mSum7);
  47. __mSum0 = _mm_add_pd(__mSum0, __mSum2);
  48. __mSum4 = _mm_add_pd(__mSum4, __mSum6);
  49. __mSum0 = _mm_add_pd(__mSum0, __mSum4);
  50. pDatum = (Double_t *)(&__mSum0);
  51. dtdst = pDatum[0] + pDatum[1];
  52. #endif
  53. for(; i < dwCount; i++)
  54. {
  55. dtdst += (Double_t)DatumGetDouble_t(dtlft[i]);
  56. }
  57. }
  58. else
  59. {
  60. for(i = 0; i < dwCount; i++)
  61. {
  62. if (BMTestZero_HY(dwNull, i))
  63. {
  64. dtdst += (Double_t)DatumGetDouble_t(dtlft[i]);
  65. }
  66. }
  67. }
  68. return Double_tGetDatum(dtdst);
  69. }
添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注