@heavysheep, 2020-11-03

Code Quality Review - October 2020


BAD CODE-1

Code information

Project: Beijing Cyberspace Administration (北京网信办) Data Center
Owner: 王子
Business logic: A set of handler classes that audit text, each matching against different keywords and rules, including homophone (pinyin) and glyph-shape matching.
Code link: https://git.datagrand.com/wbyy_group/bjwxb/bjwxb_alarm/tree/master/app/controller
Location: each handler class

Code snippet

class WordController(object):
    data_path = "data"

    def __init__(self):
        self._element = "match"
        self._word_dict = self._load_dict()
        self._ac = self._compile_ac()

    def match(self, text: str):
        ...

    def parser(self, text):
        match_result = self.match(text)
        for k in match_result:
            match_result[k] = 0.99 if len(match_result[k]) > 1 else 0.9
        return match_result

    def _load_dict(self):
        ...

    def _compile_ac(self):
        ac = ahocorasick.Automaton()
        for w in self._word_dict.keys():
            if w.strip():
                ac.add_word(w, w)
        ac.make_automaton()
        return ac


class PinYinController(object):
    data_path = "data"

    def __init__(self):
        self._element = "pinyin"
        self._pinyin_dict = self._load_dict()
        self._ac = self._compile_ac()

    def match(self, text: str):
        ...

    def parser(self, text):
        match_result = self.match(text)
        for k in match_result:
            match_result[k] = pow(2.2, len(match_result)) / 100
        return match_result

    def _load_dict(self):
        ...
        return pinyin_dict

    def _compile_ac(self):
        ac = ahocorasick.Automaton()
        for w in self._pinyin_dict.keys():
            if w.strip():
                ac.add_word(w, w)
        ac.make_automaton()
        return ac

    @staticmethod
    def _join(word: str, join_str: str = "_"):
        return join_str.join(pypinyin.lazy_pinyin(word))

Issues

  1. Several methods are duplicated across the handler classes instead of being factored into a common base class (see the sketch after this list).
  2. There are no comments, docstrings, or examples, so the business rules and methods are hard to follow.
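As issue 1 suggests, the dictionary/automaton plumbing could live in one base class, leaving only dictionary loading and the scoring rule to each subclass. A minimal sketch, assuming the elided match() collects per-keyword hits the way parser()'s len(match_result[k]) implies; AcMatchControllerBase, _score and the placeholder dictionary are hypothetical, not from the repository:

    import ahocorasick


    class AcMatchControllerBase(object):
        """Hypothetical base class: dictionary loading and scoring stay in subclasses,
        everything else (automaton build, matching, parsing) is shared."""

        data_path = "data"

        def __init__(self):
            self._word_dict = self._load_dict()
            self._ac = self._compile_ac()

        def _load_dict(self):
            raise NotImplementedError  # each subclass loads its own dictionary

        def _score(self, hits):
            raise NotImplementedError  # each subclass keeps only its scoring rule

        def _compile_ac(self):
            ac = ahocorasick.Automaton()
            for w in self._word_dict.keys():
                if w.strip():
                    ac.add_word(w, w)
            ac.make_automaton()
            return ac

        def match(self, text: str):
            # assumption: collect the end positions of every keyword hit
            result = {}
            for end_index, word in self._ac.iter(text):
                result.setdefault(word, []).append(end_index)
            return result

        def parser(self, text):
            return {word: self._score(hits) for word, hits in self.match(text).items()}


    class WordController(AcMatchControllerBase):
        def _load_dict(self):
            # placeholder; the real implementation reads from self.data_path
            return {"keyword_a": None, "keyword_b": None}

        def _score(self, hits):
            return 0.99 if len(hits) > 1 else 0.9

PinYinController would differ only in _load_dict, _score and the pinyin-joining helper.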

BAD CODE-2

Code information

Project: Beijing Cyberspace Administration (北京网信办) Data Center
Owner: 韩伟
Business logic: Reads batches of documents from ES and clusters similar events.
Code link: https://git.datagrand.com/wbyy_group/bjwxb/bjwxb_alarm/tree/master/app/controller
Location: entire file

Code snippet

class ThreadWithReturn(threading.Thread):
    """
    Thread subclass that keeps its return value.
    Args:
        threading ([type]): [description]
    """

    def __init__(self, func, args=()):
        super(ThreadWithReturn, self).__init__()
        self.func = func
        self.args = args

    def run(self):
        self.result = self.func(*self.args)

    def get_result(self):
        try:
            return self.result
        except Exception:
            return None


class SimiFinder:
    def __init__(self):
        self.es_recall = ESRecall()
        self.text_simi_ = TextSimi()

    @time_cost
    def simi_finder(self, text_list):
        try:
            info_dict = self.process_data(text_list)
            t_es = ThreadWithReturn(self._get_es_simi, args=(info_dict,))
            t_es.start()
            t_self_simi = ThreadWithReturn(self._get_self_simi, args=(info_dict,))
            t_self_simi.start()
            t_es.join()
            es_simi_dict = t_es.get_result()
            t_self_simi.join()
            self_simi_dict = t_self_simi.get_result()
            for i in range(len(text_list)):
                term_dict = text_list[i]
                item_id = term_dict['fields']['itemid']
                item_es_simi_list = es_simi_dict.get(item_id, [])
                item_self_simi_list = self_simi_dict.get(item_id, [])
                simi_result = item_es_simi_list + item_self_simi_list
                if simi_result:
                    term_dict['fields'].setdefault('has_repeated', True)
                else:
                    term_dict['fields'].setdefault('has_repeated', False)
                term_dict['fields'].setdefault('repeatedItems', simi_result)
                text_list[i] = term_dict
        except:
            import traceback
            logger.error('Get simi error: {}'.format(traceback.format_exc()))
            text_list = []
        return text_list

Issues

  1. No clear reason was found for spawning threads inside a Flask request handler (see the sketch after this list).
  2. Responsibilities are not cleanly separated between functions.
  3. The business logic is deeply nested and under-abstracted, which makes it hard to read.
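A minimal sketch addressing issues 1-3 together: if the two lookups really must run concurrently, the standard-library executor already returns results, so no custom Thread subclass is needed, and the per-item annotation moves out of the loop. process_data, _get_es_simi and _get_self_simi are the existing SimiFinder methods; _annotate_item is a hypothetical helper; the original bare except (return []) is left to the caller here:

    from concurrent.futures import ThreadPoolExecutor


    def _annotate_item(term_dict, es_simi_dict, self_simi_dict):
        """Attach similarity results to one item (pulled out of the loop body)."""
        item_id = term_dict['fields']['itemid']
        simi_result = es_simi_dict.get(item_id, []) + self_simi_dict.get(item_id, [])
        term_dict['fields']['has_repeated'] = bool(simi_result)
        term_dict['fields']['repeatedItems'] = simi_result
        return term_dict


    class SimiFinder:
        # __init__, process_data, _get_es_simi, _get_self_simi as in the original class

        def simi_finder(self, text_list):
            info_dict = self.process_data(text_list)
            # the executor hands back return values directly
            with ThreadPoolExecutor(max_workers=2) as pool:
                es_future = pool.submit(self._get_es_simi, info_dict)
                self_future = pool.submit(self._get_self_simi, info_dict)
                es_simi_dict = es_future.result()
                self_simi_dict = self_future.result()
            return [_annotate_item(item, es_simi_dict, self_simi_dict) for item in text_list]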

BAD CODE-3

Code information

Project: a sentiment classification project
Owner: 房悦竹
Business logic: Computes sentiment scores for review text.
Code link: https://git.datagrand.com/wbyy_group/text_sentiment/blob/master/text_sentiment/src/sentiment_dict.py
Location: lines 88:207

Code snippet

...

# @timing
def get_review_score(self, review):
    """
    Return the review score based on the general-purpose dictionary rules.
    :param review: review text
    :return: review_score: [x, y], x = positive score, y = negative score;
             result: {0: x, 1: y}, x = negative score, y = positive score; hit_word_list:
    """
    sentences = tp.split_sentence(review)
    senti_score = []
    hit_word_list = []
    for sen in sentences:
        zhuanzhe_flag = 0  # adversative ("zhuanzhe") flag
        # seg = self.wordseg.cut(sen, with_stop_word=False, model=None)
        seg = self.wordseg.lcut(sen)
        seg = process_contain_no(seg)  # special handling of "不" (not) in the segmentation result
        cur_word_pos = 0  # current word position
        senti_word_pos = 0  # sentiment word position
        pos_score = 0
        neg_score = 0
        seg_len = len(seg)
        for word in seg:
            current_neg = 0
            current_pos = 0
            if word in self.zhuanzhe_dict:
                zhuanzhe_flag = self.zhuanzhe_dict[word]
                hit_word = 'zhuanzhe:' + word
                hit_word_list.append(hit_word)
            if word in self.pos_dict:
                hit_word = 'pos:' + word
                hit_word_list.append(hit_word)
                doubt_flag = False  # whether a doubt word was found
                current_pos += 1
                start_pos = max(cur_word_pos - 5, senti_word_pos)
                for w in seg[start_pos:cur_word_pos]:
                    current_pos = self.match_adv(w, current_pos, hit_word_list)
                if cur_word_pos != 0:
                    s_idx = max([cur_word_pos - 2, 0])  # look two words back
                    for w in seg[s_idx:cur_word_pos]:
                        current_pos, doubt_flag, hit_word_list = self.match_doubt_forward(w, current_pos,
                                                                                          hit_word_list)
                        if doubt_flag:
                            break
                if not doubt_flag:  # stop looking ahead once a doubt word is found
                    if cur_word_pos < seg_len - 1:
                        e_idx = min([cur_word_pos + 3, seg_len])  # look two words ahead
                        for w in seg[cur_word_pos + 1:e_idx]:
                            current_pos, doubt_flag, hit_word_list = self.match_doubt_backward(w, current_pos,
                                                                                               hit_word_list)
                            if doubt_flag:
                                break
                pos_score += current_pos
                senti_word_pos = cur_word_pos
            elif word in self.neg_dict:
                hit_word = 'neg:' + word
                hit_word_list.append(hit_word)
                doubt_flag = False  # whether a doubt word was found
                current_neg += 1
                start_pos = max(cur_word_pos - 5, senti_word_pos)
                for w in seg[start_pos:cur_word_pos]:
                    current_neg = self.match_adv(w, current_neg, hit_word_list)
                if cur_word_pos != 0:
                    s_idx = max([cur_word_pos - 2, 0])  # look two words back
                    for w in seg[s_idx:cur_word_pos]:
                        current_neg, doubt_flag, hit_word_list = self.match_doubt_forward(w, current_neg,
                                                                                          hit_word_list)
                        if doubt_flag:
                            break
                if not doubt_flag:  # stop looking ahead once a doubt word is found
                    if cur_word_pos < seg_len - 1:
                        e_idx = min([cur_word_pos + 3, seg_len])  # look two words ahead
                        for w in seg[cur_word_pos + 1:e_idx]:
                            current_neg, doubt_flag, hit_word_list = self.match_doubt_backward(w, current_neg,
                                                                                               hit_word_list)
                            if doubt_flag:
                                break
                neg_score += current_neg
                senti_word_pos = cur_word_pos
            elif word in self.insult_dict:
                hit_word = 'insult:' + word
                hit_word_list.append(hit_word)
                current_neg += 1 * 2  # words in insult_dict get a weight of 2.0
                neg_score += current_neg
            cur_word_pos += 1
        pos_score, neg_score = zhuanzhe_process(zhuanzhe_flag, pos_score, neg_score, senti_score)
        senti_score.append(transform_to_positive_num(pos_score, neg_score))
    # pay more attention to last sentence
    if senti_score[-1][0] > senti_score[-1][1]:
        senti_score[-1][0] *= 1.5
    if senti_score[-1][1] > senti_score[-1][0]:
        senti_score[-1][1] *= 1.5
    review_score = sum_sentences_score(senti_score)  # [Pos, Neg]
    result = {}
    if review_score[0] == 0 and review_score[1] == 0:
        result[0] = 0.5
        result[1] = 0.5
    elif review_score[0] == 0:  # dictionary positive score is 0
        result[0] = 0.75  # originally 1, changed to 0.8
        result[1] = 0.25
    elif review_score[1] == 0:  # dictionary negative score is 0
        result[0] = 0.25
        result[1] = 0.75
    else:
        temp_sum = review_score[0] + review_score[1]
        result[0] = round(review_score[1] / temp_sum, 2)
        result[1] = round(review_score[0] / temp_sum, 2)
    return review_score, result, hit_word_list

...

Issues

  1. The function is far too long and poorly abstracted (a sketch of extracting the duplicated scoring logic follows this list).
  2. The deeply nested logic hurts readability and maintainability.
  3. Some weights (e.g. the 2.0 insult weight and the 1.5 last-sentence boost) should be documented, stating the rationale behind them.
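One way to address issues 1-2 is to collapse the two near-identical pos/neg branches into a single helper. A sketch under the assumption that it becomes a method of the same class (the name _score_sentiment_word is hypothetical), reusing the existing match_adv / match_doubt_forward / match_doubt_backward:

    # hypothetical method of the same class as get_review_score
    def _score_sentiment_word(self, seg, cur_word_pos, senti_word_pos, hit_word_list):
        """Score one sentiment word: adverb boosts in the window since the last
        sentiment word, then doubt words two tokens back and two tokens ahead.
        Mirrors the duplicated pos/neg branches above."""
        score = 1
        doubt_flag = False
        seg_len = len(seg)
        for w in seg[max(cur_word_pos - 5, senti_word_pos):cur_word_pos]:
            score = self.match_adv(w, score, hit_word_list)
        if cur_word_pos != 0:
            for w in seg[max(cur_word_pos - 2, 0):cur_word_pos]:  # two words back
                score, doubt_flag, hit_word_list = self.match_doubt_forward(w, score, hit_word_list)
                if doubt_flag:
                    break
        if not doubt_flag and cur_word_pos < seg_len - 1:
            for w in seg[cur_word_pos + 1:min(cur_word_pos + 3, seg_len)]:  # two words ahead
                score, doubt_flag, hit_word_list = self.match_doubt_backward(w, score, hit_word_list)
                if doubt_flag:
                    break
        return score

With the helper in place, the body of the word loop reduces to roughly:

    if word in self.pos_dict:
        hit_word_list.append('pos:' + word)
        pos_score += self._score_sentiment_word(seg, cur_word_pos, senti_word_pos, hit_word_list)
        senti_word_pos = cur_word_pos
    elif word in self.neg_dict:
        hit_word_list.append('neg:' + word)
        neg_score += self._score_sentiment_word(seg, cur_word_pos, senti_word_pos, hit_word_list)
        senti_word_pos = cur_word_pos
    elif word in self.insult_dict:
        hit_word_list.append('insult:' + word)
        neg_score += 2  # insult words keep their 2.0 weight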

GOOD CODE-1

Code information

Project: VOC core
Business logic: Wraps the ES query DSL.
Code link: https://git.datagrand.com/voc/voc_api/blob/master/app/common/es.py
Location: lines 25:368

Code snippet

class ESQuery(object):
    def __init__(self, index: str = None, doc_type: str = None, bool: dict = None, collapse: dict = None,
                 dsl: dict = None):
        """
        ES query DSL wrapper.
        :param index: ES index
        :param doc_type: ES doc_type (if any)
        :param bool: bool dsl
        :param collapse: collapse dsl (field collapsing)
        :param dsl: source dsl. When given, it is used to build the dsl directly, which differs from
                    parsing bool/collapse and the other parameters
        example:
            # [build condition]
            filter_condition = Query.term(field="is_match", value=True)
            agg_condition = [Aggs.terms(name="group", field="website.keyword", size=500)]
            # [input query]
            query = ESQuery(index="index_1", doc_type="type_1", bool=bools(filter=filter_condition))
            query.aggs(condition=agg_condition)
            # [modify attr]
            query.size(0)
            query.sort("datetime:desc")
            # [get result]
            query_result, _ = query.search()
        """
        if dsl and bool:
            raise ValueError("'dsl' and 'bool' must not be passed together")
        if dsl and collapse:
            raise ValueError("'dsl' and 'collapse' must not be passed together")
        self.index = index
        self.doc_type = doc_type
        self.params = {}
        self.bool = bool
        self.dsl = dsl if dsl is not None else {}
        if bool:
            self.dsl["query"] = bool
        if collapse:
            self.dsl["collapse"] = collapse

    def bools(self, must: (dict, list) = None, filter: (dict, list) = None, should: (dict, list) = None,
              must_not: (dict, list) = None):
        """
        bool conditions
        :param must: must conditions
        :param filter: filter conditions
        :param should: should conditions
        :param must_not: must_not conditions
        """
        for name, condition_info in (("must", must), ("filter", filter), ("should", should), ("must_not", must_not)):
            if self.dsl.get("query") is None:
                self.dsl["query"] = {"bool": {}}
            if self.dsl["query"].get("bool") is None:
                self.dsl["query"]["bool"] = {}
            if condition_info is not None:
                condition = _parser_condition(condition_info)
                if name not in self.dsl["query"]["bool"]:
                    self.dsl["query"]["bool"][name] = condition
                else:
                    self.dsl["query"]["bool"][name].extend(condition)

    def aggs(self, condition):
        if isinstance(condition, list):
            condition = {k: v for c in condition for k, v in c.items()}
        self.dsl["aggs"] = condition

    def range(self, range_condition: dict):
        """
        Range condition, corresponds to the DSL range query.
        ES docs: [https://www.elastic.co/guide/en/elasticsearch/reference/5.5/query-dsl-range-query.html]
        :param range_condition: range condition
        """
        if self.dsl.get("query") is None:
            self.dsl["query"] = {"bool": {}}
        if self.dsl["query"].get("bool") is None:
            self.dsl["query"]["bool"] = {}
        if self.dsl["query"]["bool"].get("must") is None:
            self.dsl["query"]["bool"]["must"] = range_condition
            return
        for i, condition in enumerate(self.dsl["query"]["bool"]["must"]):
            if "range" in condition.keys():
                self.dsl["query"]["bool"]["must"][i] = range_condition
                return
        self.dsl["query"]["bool"]["must"].append(range_condition)

    def source(self, include: (str, list) = None, exclude: (str, list) = None):
        """
        Field selection, corresponds to DSL _source.
        ES docs: [https://www.elastic.co/guide/en/elasticsearch/reference/5.5/search-request-source-filtering.html]
        :param include: fields to keep
        :param exclude: fields to drop
        """
        self.dsl["_source"] = {}
        if include:
            self.dsl["_source"]["includes"] = include
        if exclude:
            self.dsl["_source"]["excludes"] = exclude

    def pagination(self, page: int = 1, per_page: int = 20):
        """
        Pagination, corresponds to DSL from/size.
        ES docs: [https://www.elastic.co/guide/en/elasticsearch/reference/5.5/search-request-from-size.html]
        :param page: page number, starting at 1, defaults to 1
        :param per_page: page size, defaults to 20
        """
        if page is None:
            page = 1
        if per_page is None:
            per_page = 20
        from_ = (page - 1) * per_page
        self.dsl["from"] = from_
        self.dsl["size"] = per_page

    def sort(self, condition: (str, dict)):
        """
        Sorting, corresponds to DSL sort.
        ES docs: [https://www.elastic.co/guide/en/elasticsearch/reference/5.5/search-request-sort.html]
        Note: this sorts the query object itself.
        :param condition: sort condition
        """
        if isinstance(condition, dict):
            condition = ",".join(("{}:{}".format(c_k, c_v) for c_k, c_v in condition.items()))
        self.params["sort"] = condition

    def size(self, num: int):
        """
        Result size, corresponds to DSL size.
        Note: conflicts with pagination.
        :param num: size
        """
        self.dsl.update({"size": num})

    def scroll(self, scroll: str):
        """
        Scrolling, corresponds to DSL scroll.
        ES docs: [https://www.elastic.co/guide/en/elasticsearch/reference/5.5/search-request-scroll.html]
        :param scroll: scroll ID
        """
        self.params.update({"scroll": scroll})

    def search(self, debug=False):
        """
        Search, corresponds to the ES request body search API.
        ES docs: [https://www.elastic.co/guide/en/elasticsearch/reference/5.5/search-uri-request.html]
        :param debug: whether to log (at info level) the DSL request body
        """
        if debug:
            logger.info(json.dumps(self.dsl, ensure_ascii=False, indent=4))
        if self.dsl.get("size") is None:
            self.size(50)
        result = es.search(body=self.dsl, index=self.index, doc_type=self.doc_type, params=self.params)
        result.update({"pager_info": self._get_page_info(result)})
        return result

    def _get_page_info(self, result: dict):
        """
        Pagination info, describes the paging of the result.
        :param result: ES query result
        :return: pagination info
        """
        pager_info = {}
        if self.dsl.get("from") is not None:
            pager_info["page"] = self.dsl["from"]
            pager_info["per_page"] = self.dsl["size"]
            pager_info["total"] = result["hits"]["total"]
            pager_info["pages"] = math.ceil(pager_info["total"] / pager_info["per_page"])
        if self.params.get("sort") is not None:
            pager_info["order_by"] = self.params["sort"]
        return pager_info

    def clone(self):
        return ESQuery(index=self.index, doc_type=self.doc_type, dsl=deepcopy(self.dsl))

Highlights

Wraps the ES DSL behind a single class, so callers no longer describe query logic by hand-building dicts, which reads much better. A hypothetical usage example follows.
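For illustration, a hypothetical query built only from the methods shown above (field names are made up; the Query/Aggs/bools helpers referenced in the docstring live elsewhere in the same module), next to the raw request body it spares the caller from assembling:

    # roughly equivalent raw body the caller would otherwise build by hand:
    # {"query": {"bool": {"filter": [{"term": {"is_match": True}}]}},
    #  "_source": {"includes": ["title", "datetime"]}, "from": 0, "size": 20}

    query = ESQuery(index="index_1", doc_type="type_1")
    query.bools(filter={"term": {"is_match": True}})
    query.source(include=["title", "datetime"])
    query.pagination(page=1, per_page=20)
    query.sort("datetime:desc")
    result = query.search(debug=True)  # debug=True logs the generated DSL
    print(result["pager_info"])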

GOOD CODE-2

Code information

Project: BERT text similarity
Business logic: Business method for inferring text similarity.
Code link: https://git.datagrand.com/wbyy_group/text_simi_bert/blob/master/app/src/text_simi.py
Location: lines 79:120

Code snippet

def get_simi(self, query_list, candidate_list=None, simi='bert'):
    """
    Return similarities.
    :param self_simi: whether to compute self-similarity
    :param query_list: list of query texts
    :param candidate_list: list of candidate texts
    :return: similarity between the two lists, shape [len(query_list), len(candidate_list)]
    """
    if candidate_list:
        corpus_list = query_list + candidate_list
    else:
        corpus_list = query_list
    if simi != 'bert':
        # word segmentation
        feature_list = [[word for word in jieba.cut(text) if word not in self.stop_word_dict] for text in corpus_list]
        # build the dictionary
        dictionary = Dictionary(feature_list)
        corpus_features = [dictionary.doc2bow(feature) for feature in feature_list]
        # build tf-idf
        tf_idf_model = TfidfModel(corpus_features)
        query_tfidf = tf_idf_model[corpus_features[:len(query_list)]]
        candidate_tfidf = tf_idf_model[corpus_features[len(query_list):]]
        # compute similarity
        sparse_matrix = SparseMatrixSimilarity(candidate_tfidf, len(dictionary.token2id))
        simi_result = sparse_matrix.get_similarities(query_tfidf)
    else:
        query_vec = self._get_vecs(query_list)
        if candidate_list:
            corpus_vec = self._get_vecs(candidate_list)
        else:
            corpus_vec = query_vec
        label_vec = query_vec / (query_vec ** 2).sum(axis=1, keepdims=True) ** 0.5
        corpus_list = corpus_vec / (corpus_vec ** 2).sum(axis=1, keepdims=True) ** 0.5
        simi_result = np.dot(label_vec, corpus_list.T)
    return simi_result

Highlights

  1. Clear, well-separated logic, with solid method docstrings and comments (a hypothetical call example follows).
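A hypothetical call, assuming get_simi is a method of the TextSimi class in this module; the texts and the 0.8 threshold are made up for illustration:

    queries = ["电池续航太差了", "物流很快,服务态度也好"]
    candidates = ["电池不耐用,半天就没电", "快递速度非常快", "屏幕显示效果一般"]

    simi = TextSimi()
    scores = simi.get_simi(queries, candidates, simi='bert')  # shape [len(queries), len(candidates)]
    for i, row in enumerate(scores):
        j = int(row.argmax())
        if row[j] > 0.8:  # illustrative threshold
            print(queries[i], "~", candidates[j], float(row[j]))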