[关闭]
@Team 2018-05-24T12:37:56.000000Z 字数 20180 阅读 2413

SSD源码解析

薛坤军


一.本文主要内容

Model

SSD模型采用VGG16作为基础网络结构(base network),在base network 之后添加了额外的网络结构,如下图所示:

image.png-81.4kB

1. Multi-scale feature maps for detection

  • 在base network(VGG16的前5层)之后添加了额外的卷积层,具体利用atrous算法将fc6和fc7层转化为两个卷积层,再额外增加3个卷积层(Conv:1*1+Conv:3*3)和一个平均池化层(Avg Pooling,论文中是一个Conv:1*1+Conv:3*3,具有相同作用);

  • 这里我们在网络的所有特征图上应用3*3卷积进行预测,来自较低层的预测有助于处理较小的物体。因为低层的feature map的感受野较小。这意味着可以通过使用与感受野大小相似的feature map来处理大小不同的对象,即达到多尺度特征图检测的目的;

  • 关键代码解析:

  # Excerpt: default hyper-parameters of the SSD-300 network.
  class SSDNet(object):
      """Implementation of the SSD VGG-based 300 network.

      The default feature layers with a 300x300 image input are
      (positions used for multi-scale feature-map detection):
          conv4  ==> 38 x 38
          conv7  ==> 19 x 19
          conv8  ==> 10 x 10
          conv9  ==> 5 x 5
          conv10 ==> 3 x 3
          conv11 ==> 1 x 1

      The default image size used to train this network is 300x300.
      """
      # NOTE(review): SSDParams is defined outside this excerpt.
      default_params = SSDParams(
          img_shape=(300, 300),  # input image size
          num_classes=21,  # 20 object classes + 1 background class
          no_annotation_label=21,
          # Feature maps on which detection is performed (multi-scale).
          feat_layers=['block4', 'block7', 'block8', 'block9', 'block10', 'block11'],
          # Spatial size of each of those feature maps.
          feat_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
          # Default-box scale of the lowest and highest layer; tune as needed.
          # The article gives [0.20, 0.90] as the value used in the paper.
          anchor_size_bounds=[0.15, 0.90],
          # Default-box sizes per layer: (size, next size), in input pixels.
          anchor_sizes=[(21., 45.),
                        (45., 99.),
                        (99., 153.),
                        (153., 207.),
                        (207., 261.),
                        (261., 315.)],
          # Alternative sizes kept from the original listing:
          # anchor_sizes=[(30., 60.),
          #               (60., 111.),
          #               (111., 162.),
          #               (162., 213.),
          #               (213., 264.),
          #               (264., 315.)],
          # Extra aspect ratios of the default boxes, per layer.
          anchor_ratios=[[2, .5],
                         [2, .5, 3, 1./3],
                         [2, .5, 3, 1./3],
                         [2, .5, 3, 1./3],
                         [2, .5],
                         [2, .5]],
          # Spacing, in input pixels, between neighbouring default-box centers.
          anchor_steps=[8, 16, 32, 64, 100, 300],
          # Offset added to each grid index when placing a box center.
          anchor_offset=0.5,
          # Per layer: a value > 0 means the feature map is L2-normalized
          # before prediction; -1 disables it.
          normalizations=[20, -1, -1, -1, -1, -1],
          # Divisors applied to the encoded (cy, cx, h, w) offsets.
          prior_scaling=[0.1, 0.1, 0.2, 0.2]
      )
  # SSD network definition.
  def ssd_net(input,
              num_classes=SSDNet.default_params.num_classes,
              feat_layers=SSDNet.default_params.feat_layers,
              anchor_sizes=SSDNet.default_params.anchor_sizes,
              anchor_ratios=SSDNet.default_params.anchor_ratios,
              normalizations=SSDNet.default_params.normalizations,
              is_training=True,
              dropout_keep_prob=0.5,
              prediction_fn=slim.softmax,
              reuse=None,
              scope='ssd_300_vgg'):
      """Build the SSD-300 graph: VGG-16 blocks, extra blocks, multibox heads.

      Arguments:
        input: Image batch tensor fed to the first convolution. The name
          shadows the builtin but is kept for keyword-argument callers.
        num_classes: Number of classes predicted per default box.
        feat_layers: Names of the end points used for prediction.
        anchor_sizes, anchor_ratios, normalizations: Per-layer settings,
          indexed in the same order as `feat_layers`.
        is_training: Enables dropout when true.
        dropout_keep_prob: Probability of keeping a unit in the dropout layers.
        prediction_fn: Function turning class logits into predictions.
        reuse: Passed to the variable scope.
        scope: Variable scope name.

      Return:
        (predictions, localisations, logits, end_points): three lists with one
        entry per feature layer, and the dict of intermediate activations.
      """
      # End_points collect relevant activations for external use: it stores
      # the output feature map of every block.
      end_points = {}
      # Fixed: the listing passed the undefined name `inputs` here.
      with tf.variable_scope(scope, 'ssd_300_vgg', [input], reuse=reuse):
          # ======== Original VGG-16 blocks ========
          # Block 1.
          net = slim.repeat(input, 2, slim.conv2d, 64, [3, 3], scope='conv1')
          end_points['block1'] = net
          net = slim.max_pool2d(net, [2, 2], scope='pool1', padding='SAME')
          # Block 2.
          net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
          end_points['block2'] = net
          net = slim.max_pool2d(net, [2, 2], scope='pool2', padding='SAME')
          # Block 3.
          net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
          end_points['block3'] = net
          net = slim.max_pool2d(net, [2, 2], scope='pool3', padding='SAME')
          # Block 4.
          net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
          # 1st prediction feature map; the article annotates its shape as
          # (batch_size, 38, 38, 512) for a 300x300 input.
          end_points['block4'] = net
          net = slim.max_pool2d(net, [2, 2], scope='pool4', padding='SAME')
          # Block 5.
          net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
          end_points['block5'] = net
          net = slim.max_pool2d(net, [3, 3], stride=1, scope='pool5', padding='SAME')

          # ======== Additional SSD blocks ========
          # Block 6: dilated 3x3 convolution (rate=6).
          net = slim.conv2d(net, 1024, [3, 3], rate=6, scope='conv6')
          end_points['block6'] = net
          # tf.layers.dropout expects the fraction of units to DROP, so the
          # keep probability is converted (same value at the default 0.5).
          net = tf.layers.dropout(net, rate=1.0 - dropout_keep_prob,
                                  training=is_training)
          # Block 7: 1x1 convolution.
          net = slim.conv2d(net, 1024, [1, 1], scope='conv7')
          # 2nd prediction feature map, annotated (batch_size, 19, 19, 1024).
          end_points['block7'] = net
          net = tf.layers.dropout(net, rate=1.0 - dropout_keep_prob,
                                  training=is_training)

          # Block 8/9/10/11: 1x1 and 3x3 convolutions, stride 2 (except lasts).
          end_point = 'block8'
          with tf.variable_scope(end_point):
              net = slim.conv2d(net, 256, [1, 1], scope='conv1x1')
              net = custom_layers.pad2d(net, pad=(1, 1))
              net = slim.conv2d(net, 512, [3, 3], stride=2, scope='conv3x3', padding='VALID')
          # 3rd prediction feature map, annotated (batch_size, 10, 10, 512).
          end_points[end_point] = net
          end_point = 'block9'
          with tf.variable_scope(end_point):
              net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
              net = custom_layers.pad2d(net, pad=(1, 1))
              net = slim.conv2d(net, 256, [3, 3], stride=2, scope='conv3x3', padding='VALID')
          # 4th prediction feature map, annotated (batch_size, 5, 5, 256).
          end_points[end_point] = net
          end_point = 'block10'
          with tf.variable_scope(end_point):
              net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
              net = slim.conv2d(net, 256, [3, 3], scope='conv3x3', padding='VALID')
          # 5th prediction feature map, annotated (batch_size, 3, 3, 256).
          end_points[end_point] = net
          end_point = 'block11'
          with tf.variable_scope(end_point):
              net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
              net = slim.conv2d(net, 256, [3, 3], scope='conv3x3', padding='VALID')
          # 6th prediction feature map, annotated (batch_size, 1, 1, 256).
          end_points[end_point] = net

          # Prediction and localisations layers.
          predictions = []
          logits = []
          localisations = []
          for i, layer in enumerate(feat_layers):
              with tf.variable_scope(layer + '_box'):
                  # Predict, for every default box of this layer, the class
                  # scores and the box position (as an offset relative to the
                  # default box).
                  cls_logits, loc = ssd_multibox_layer(end_points[layer],
                                                       num_classes,
                                                       anchor_sizes[i],
                                                       anchor_ratios[i],
                                                       normalizations[i])
              # Predictions after `prediction_fn` (softmax by default).
              predictions.append(prediction_fn(cls_logits))
              # Raw class scores, before `prediction_fn` is applied.
              logits.append(cls_logits)
              # Predicted offsets relative to the default boxes.
              localisations.append(loc)

          return predictions, localisations, logits, end_points
  ssd_net.default_image_size = 300
测试使用的是tf-1.1.0版本,输入300*300的图片时,feature map的shape和预期不一样,因此在源码中做了改动,即在max_pool中添加参数padding='SAME'。

2. Convolutional predictors for detection

  • 每一个用于预测的特征层(base network之后的feature map),使用一系列 convolutional filters,产生一系列固定大小(即每个特征图预测的尺度是固定的)的 predictions。对于一个 m×n,具有 p 通道的feature map,使用的convolutional filters 是 3×3 的 kernels。预测default box的类别和偏移位置;
  • YOLO 则是用一个全连接层来代替这里的卷积层,全连接层导致输入大小必须固定;
  • 关键代码分析:
  # Predict on one feature map (box offsets and class scores).
  #
  # ssd_net calls this once per prediction layer, with arguments that
  # correspond one-to-one across layers:
  #   inputs:      feature map of 'block4', 'block7', 'block8', 'block9',
  #                'block10', 'block11'
  #   num_classes: 21
  #   sizes:       (21., 45.), (45., 99.), (99., 153.), (153., 207.),
  #                (207., 261.), (261., 315.)
  #   ratios:      [2, .5], [2, .5, 3, 1./3], [2, .5, 3, 1./3],
  #                [2, .5, 3, 1./3], [2, .5], [2, .5]
  def ssd_multibox_layer(inputs,
                         num_classes,
                         sizes,
                         ratios=[1],
                         normalization=-1,
                         bn_normalization=False):
      """Construct a multibox layer, return class and localization predictions.

      Arguments:
        inputs: Feature map to predict on.
        num_classes: Number of classes scored for each default box.
        sizes: Default-box sizes of this layer; only its length is used here.
        ratios: Extra aspect ratios of this layer; only its length is used
          here (the default list is never mutated).
        normalization: L2-normalize `inputs` first when > 0.
        bn_normalization: Unused in this function.

      Return:
        (cls_pred, loc_pred): tensors whose last two dimensions are
        [num_anchors, num_classes] and [num_anchors, 4] respectively.
      """
      net = inputs
      # Optional L2 normalization of the feature map.
      if normalization > 0:
          net = custom_layers.l2_normalization(net, scaling=True)
      # Number of anchors (default boxes) at each feature-map position:
      # len(sizes) boxes with aspect ratio 1 plus len(ratios) boxes with the
      # other aspect ratios.
      num_anchors = len(sizes) + len(ratios)

      # Location: 4 values per default box.
      num_loc_pred = num_anchors * 4
      # Convolutional predictor for the box positions. The article annotates
      # the outputs for the six layers as (batch_size, S, S, num_loc_pred)
      # with S = 38, 19, 10, 5, 3, 1.
      loc_pred = slim.conv2d(net, num_loc_pred, [3, 3], activation_fn=None,
                             scope='conv_loc')
      loc_pred = custom_layers.channel_to_last(loc_pred)
      # Split the channel dimension into [num_anchors, 4].
      loc_pred = tf.reshape(loc_pred,
                            tensor_shape(loc_pred, 4)[:-1]+[num_anchors, 4])

      # Class prediction: num_classes scores per default box.
      num_cls_pred = num_anchors * num_classes
      cls_pred = slim.conv2d(net,
                             num_cls_pred,
                             [3, 3],
                             activation_fn=None,
                             scope='conv_cls')
      cls_pred = custom_layers.channel_to_last(cls_pred)
      # Split the channel dimension into [num_anchors, num_classes].
      cls_pred = tf.reshape(cls_pred,
                            tensor_shape(cls_pred, 4)[:-1]+[num_anchors, num_classes])
      return cls_pred, loc_pred

3. Default boxes and aspect ratios(长宽比)

  • 在每一个用于预测的feature map上得到default boxes,default boxes的数量、尺寸、长宽比由网络结构固定而固定;
  • 关键代码解析:
  # Generate the fixed set of default boxes for one feature map.
  def ssd_anchor_one_layer(img_shape,
                           feat_shape,
                           sizes,
                           ratios,
                           step,
                           offset=0.5,
                           dtype=np.float32):
      """Compute SSD default anchor boxes for one feature layer.

      Determine the relative position grid of the centers, and the relative
      width and height.

      Arguments:
        img_shape: Image shape, used for computing height and width
          relative to the image;
        feat_shape: Feature shape, used for computing the position grid;
        sizes: Absolute reference sizes (size of this layer, and optionally
          a second size used for the extra ratio-1 box);
        ratios: Aspect ratios to use on this feature layer;
        step: Distance, in image pixels, between neighbouring box centers;
        offset: Grid offset added to every index;
        dtype: Numpy dtype of the returned arrays.

      Return:
        y, x, h, w: Relative y and x center grids of shape
        feat_shape + (1,), and relative heights and widths of shape
        (len(sizes) + len(ratios),).
      """
      # The simple way to compute the position grid would be
      #   y = (y.astype(dtype) + offset) / feat_shape[0]
      #   x = (x.astype(dtype) + offset) / feat_shape[1]
      # but the SSD-Caffe computation based on step values is followed.
      #
      # Example for the (38, 38) feature map: y holds the row index of each
      # cell ([[0, 0, ...], [1, 1, ...], ..., [37, 37, ...]]) and x holds the
      # column index ([[0, 1, ..., 37], ...]).
      y, x = np.mgrid[0:feat_shape[0], 0:feat_shape[1]]
      # Map every cell to the image and normalize by the image size, e.g.
      # y = (y + 0.5) * 8 / 300: these are the default-box centers.
      y = (y.astype(dtype) + offset) * step / img_shape[0]
      x = (x.astype(dtype) + offset) * step / img_shape[1]

      # Expand dims to support easy broadcasting, e.g. (38, 38) -> (38, 38, 1).
      y = np.expand_dims(y, axis=-1)
      x = np.expand_dims(x, axis=-1)

      # Compute relative height and width.
      # Tries to follow the original implementation of SSD for the order.
      # Number of default boxes at each feature-map position.
      num_anchors = len(sizes) + len(ratios)
      h = np.zeros((num_anchors, ), dtype=dtype)
      w = np.zeros((num_anchors, ), dtype=dtype)
      # First anchor box, aspect ratio 1: e.g. height and width 21 / 300.
      h[0] = sizes[0] / img_shape[0]
      w[0] = sizes[0] / img_shape[1]
      di = 1
      # Extra ratio-1 box of size sqrt(s_k * s_{k+1}), e.g. sqrt(21 * 45).
      if len(sizes) > 1:
          h[1] = math.sqrt(sizes[0] * sizes[1]) / img_shape[0]
          w[1] = math.sqrt(sizes[0] * sizes[1]) / img_shape[1]
          di += 1
      # Boxes for the remaining aspect ratios.
      for i, r in enumerate(ratios):
          h[i+di] = sizes[0] / img_shape[0] / math.sqrt(r)
          w[i+di] = sizes[0] / img_shape[1] * math.sqrt(r)
      # Centers plus heights and widths, e.g. y, x of shape (38, 38, 1) and
      # h, w of shape (4,).
      return y, x, h, w
  def ssd_anchors_all_layers(img_shape,      # shape of the input image
                             layers_shape,   # shapes of the feature maps
                             anchor_sizes,   # default-box sizes
                             anchor_ratios,  # aspect ratios
                             anchor_steps,
                             offset=0.5,
                             dtype=np.float32):
      """Compute anchor boxes for all feature layers.

      Example arguments (the SSD-300 defaults shown in the article):
        img_shape:     (300, 300)
        layers_shape:  [(38,38), (19,19), (10,10), (5,5), (3,3), (1,1)]
        anchor_sizes:  [(21,45), (45,99), (99,153), (153,207), (207,261),
                        (261,315)]
        anchor_ratios: [[2,.5], [2,.5,3,1./3], [2,.5,3,1./3], [2,.5,3,1./3],
                        [2,.5], [2,.5]]
        anchor_steps:  [8, 16, 32, 64, 100, 300]
        offset:        0.5

      Return:
        A list with, for every layer, the value returned by
        ssd_anchor_one_layer.
      """
      layers_anchors = []
      # enumerate yields (index, feature shape): 0,(38,38); 1,(19,19); ...;
      # the index selects the matching sizes, ratios and step of that layer.
      for i, s in enumerate(layers_shape):
          anchor_bboxes = ssd_anchor_one_layer(img_shape, s,
                                               anchor_sizes[i],
                                               anchor_ratios[i],
                                               anchor_steps[i],
                                               offset=offset,
                                               dtype=dtype)
          layers_anchors.append(anchor_bboxes)
      return layers_anchors
训练

1. 生成default box

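  • 对每种尺寸的feature map,按照相应的大小(scale)和宽高比例(ratio)在每个点生成固定数量的default box,也就是说,SSD中的default box是由网络结构固定而固定的,如下图(仅仅是为了举例),红色点代表feature map(5*5),每个位置预测3个default box,尺寸为168,宽高比为1,1/2,2,则default box宽高分别为($[168, 168]$, $[168\sqrt{1/2},\ 168/\sqrt{1/2}] \approx [118.8, 237.6]$, $[168\sqrt{2},\ 168/\sqrt{2}] \approx [237.6, 118.8]$);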
image.png-163.3kB
  • 生成default box:
    • 首先设计出最小和最大default box的尺寸 $[s_{min}, s_{max}]$,即越底层的feature map对应的default box尺寸越小(感受野越小,更适合检测小尺寸对象),论文中为[0.2,0.9],上述代码中为[0.15,0.9];
    • 每个feature map(由低层到高层)对应的default box的尺寸计算公式为:$s_k = s_{min} + \frac{s_{max} - s_{min}}{m - 1}(k - 1),\ k \in [1, m]$,其中 $m$ 为feature map数量;
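    • 每个尺寸的default box宽高根据比例值 $a_r$ 计算,如下所示:
      宽:$w_k^a = s_k\sqrt{a_r}$,高:$h_k^a = s_k/\sqrt{a_r}$,其中 $s_k$ 为default box尺寸;
      比例为1的默认框,额外添加一个尺寸为 $s'_k = \sqrt{s_k s_{k+1}}$ 的default box;
      每个默认框中心设定为 $\left(\frac{i+0.5}{|f_k|}, \frac{j+0.5}{|f_k|}\right)$,其中 $|f_k|$ 为特征图尺寸,$i, j \in [0, |f_k|)$;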

2. 生成训练数据

  • 根据图片的ground truth和default box生成训练数据,关键代码解析如下:
  # Ground-truth encoding for all feature layers.
  def tf_ssd_bboxes_encode(labels, bboxes, anchors, num_classes,
                           no_annotation_label, ignore_threshold=0.5,
                           prior_scaling=[0.1, 0.1, 0.2, 0.2],
                           dtype=tf.float32, scope='ssd_bboxes_encode'):
      """Encode groundtruth labels and bounding boxes using SSD net anchors.

      Encoding boxes for all feature layers.

      Arguments:
        labels: 1D Tensor(int64) containing groundtruth labels (classes);
        bboxes: Nx4 Tensor(float) with bboxes relative coordinates;
        anchors: List of Numpy array with layer anchors (default boxes);
        num_classes: Number of classes;
        no_annotation_label: Label marking non-annotated regions (21 in the
          article's defaults);
        ignore_threshold: Threshold forwarded to the per-layer encoder
          (0.5 by default);
        prior_scaling: Scaling of encoded coordinates;
        dtype: Float type of the target tensors;
        scope: Name scope of the created ops.

      Return:
        (target_labels, target_localizations, target_scores):
          Each element is a list of target Tensors, one per feature layer.
      """
      with tf.name_scope(scope):
          target_labels = []
          target_localizations = []
          target_scores = []
          for i, anchors_layer in enumerate(anchors):
              with tf.name_scope('bboxes_encode_block_%i' % i):
                  # Build the training targets for the default boxes of one
                  # feature map.
                  t_labels, t_loc, t_scores = \
                      tf_ssd_bboxes_encode_layer(labels, bboxes,
                                                 anchors_layer,
                                                 num_classes,
                                                 no_annotation_label,
                                                 ignore_threshold,
                                                 prior_scaling, dtype)
                  target_labels.append(t_labels)
                  target_localizations.append(t_loc)
                  target_scores.append(t_scores)
          return target_labels, target_localizations, target_scores
处理每个尺寸的default box(对应一层的feature map),生成训练数据,关键代码解析,以shape为(38,38)feature map为例:
  • 本代码块中对于每一个anchor和所有的gt计算重叠度,anchor的类别为重叠度最高的gt的类别,偏移位置为相对于重叠度最高的gt的偏移位置;
  • 给定输入图像以及每个物体的 ground truth,首先找到每个gt对应的default box中重叠度最大的作为(与该ground truth box相关的匹配)正样本。然后,在剩下的default box中找到那些与任意一个ground truth box 的 IOU 大于 0.5的default box作为(与该ground truth box相关的匹配)正样本。剩余的default box 作为负例样本;
  • 一个anchor对应一个gt,而一个gt可能对应多个anchor;
  # Ground-truth encoding for the default boxes of a single feature layer.
  def tf_ssd_bboxes_encode_layer(labels,
                                 bboxes,
                                 anchors_layer,
                                 num_classes,
                                 no_annotation_label,
                                 ignore_threshold=0.5,
                                 prior_scaling=[0.1, 0.1, 0.2, 0.2],
                                 dtype=tf.float32):
      """Encode groundtruth labels and bounding boxes using SSD anchors from
      one layer.

      Every anchor is compared with every groundtruth box; it receives the
      class and the encoded offset of the box it overlaps most.

      Arguments:
        labels: 1D Tensor(int64) containing groundtruth labels (classes);
        bboxes: Nx4 Tensor(float) with bboxes relative coordinates, indexed
          as (ymin, xmin, ymax, xmax);
        anchors_layer: Numpy array with layer anchors: (y, x, h, w);
        num_classes: Number of classes; boxes with a label >= num_classes
          are never assigned;
        no_annotation_label: Only referenced by the disabled logic at the
          end of the loop body;
        ignore_threshold: Only referenced by that same disabled logic;
        prior_scaling: Scaling of encoded coordinates;
        dtype: Float type of the target tensors.

      Return:
        (target_labels, target_localizations, target_scores): Target Tensors.
      """
      # Anchors coordinates and volume.
      # Center coordinates, heights and widths of the anchors; for the
      # (38, 38) layer the shapes are (38,38,1), (38,38,1), (4,), (4,).
      yref, xref, href, wref = anchors_layer
      # Anchor edges; broadcasting gives shape (38, 38, 4) in that example.
      ymin = yref - href / 2.
      xmin = xref - wref / 2.
      ymax = yref + href / 2.
      xmax = xref + wref / 2.
      vol_anchors = (xmax - xmin) * (ymax - ymin)  # anchor areas

      # Initialize tensors, one value per anchor: (38, 38, 4) in the example.
      shape = (yref.shape[0], yref.shape[1], href.size)
      feat_labels = tf.zeros(shape, dtype=tf.int64)
      feat_scores = tf.zeros(shape, dtype=dtype)
      feat_ymin = tf.zeros(shape, dtype=dtype)
      feat_xmin = tf.zeros(shape, dtype=dtype)
      feat_ymax = tf.ones(shape, dtype=dtype)
      feat_xmax = tf.ones(shape, dtype=dtype)

      def jaccard_with_anchors(bbox):
          """Compute jaccard score between a box and the anchors.

          `bbox` holds the four normalized edges of one groundtruth box.
          """
          # Intersection of the groundtruth box with every anchor.
          int_ymin = tf.maximum(ymin, bbox[0])
          int_xmin = tf.maximum(xmin, bbox[1])
          int_ymax = tf.minimum(ymax, bbox[2])
          int_xmax = tf.minimum(xmax, bbox[3])
          h = tf.maximum(int_ymax - int_ymin, 0.)
          w = tf.maximum(int_xmax - int_xmin, 0.)
          # Volumes.
          inter_vol = h * w  # intersection area
          union_vol = vol_anchors - inter_vol \
              + (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
          jaccard = tf.div(inter_vol, union_vol)
          return jaccard

      def intersection_with_anchors(bbox):
          """Compute intersection score between a box and the anchors.

          The score is the intersection area divided by the anchor area.
          Only the disabled no-annotation logic in `body` refers to it.
          """
          int_ymin = tf.maximum(ymin, bbox[0])
          int_xmin = tf.maximum(xmin, bbox[1])
          int_ymax = tf.minimum(ymax, bbox[2])
          int_xmax = tf.minimum(xmax, bbox[3])
          h = tf.maximum(int_ymax - int_ymin, 0.)
          w = tf.maximum(int_xmax - int_xmin, 0.)
          inter_vol = h * w
          scores = tf.div(inter_vol, vol_anchors)
          return scores

      def condition(i, feat_labels, feat_scores,
                    feat_ymin, feat_xmin, feat_ymax, feat_xmax):
          """Condition of tf.while_loop: check label index.
          """
          # True while i < number of groundtruth labels.
          r = tf.less(i, tf.shape(labels))
          return r[0]

      def body(i, feat_labels, feat_scores,
               feat_ymin, feat_xmin, feat_ymax, feat_xmax):
          """Body of tf.while_loop: update feature labels, scores and bboxes.

          An anchor is updated only when the current groundtruth box beats
          the best score stored so far. The jaccard threshold test is
          commented out below, so no minimum overlap is required here.
          """
          # Class and position of the i-th groundtruth box.
          label = labels[i]
          bbox = bboxes[i]
          # Jaccard score of that box with every anchor.
          # Fixed: the listing called the misspelled `jaccard_with_an4chors`.
          jaccard = jaccard_with_anchors(bbox)
          # Mask: check threshold + scores + no annotations + num_classes.
          # feat_scores holds the best overlap seen so far for every anchor;
          # the mask is true where this box overlaps more. Shape (38, 38, 4)
          # in the example.
          mask = tf.greater(jaccard, feat_scores)
          # mask = tf.logical_and(mask, tf.greater(jaccard, matching_threshold))
          mask = tf.logical_and(mask, feat_scores > -0.5)
          mask = tf.logical_and(mask, label < num_classes)
          imask = tf.cast(mask, tf.int64)
          fmask = tf.cast(mask, dtype)
          # Update values using mask: where the mask is set take the values
          # of this groundtruth box, elsewhere (1 - mask) keep the previous
          # ones.
          # Class label of each anchor.
          feat_labels = imask * label + (1 - imask) * feat_labels
          # Best overlap of each anchor (jaccard where mask is true, the
          # previous score otherwise).
          feat_scores = tf.where(mask, jaccard, feat_scores)
          # Groundtruth box (the one with the largest overlap) of each anchor.
          feat_ymin = fmask * bbox[0] + (1 - fmask) * feat_ymin
          feat_xmin = fmask * bbox[1] + (1 - fmask) * feat_xmin
          feat_ymax = fmask * bbox[2] + (1 - fmask) * feat_ymax
          feat_xmax = fmask * bbox[3] + (1 - fmask) * feat_xmax

          # Check no annotation label: ignore these anchors...
          # interscts = intersection_with_anchors(bbox)
          # mask = tf.logical_and(interscts > ignore_threshold,
          #                       label == no_annotation_label)
          # # Replace scores by -1.
          # feat_scores = tf.where(mask, -tf.cast(mask, dtype), feat_scores)

          return [i+1, feat_labels, feat_scores,
                  feat_ymin, feat_xmin, feat_ymax, feat_xmax]

      # Main loop definition: iterate over the groundtruth boxes.
      i = 0
      [i, feat_labels, feat_scores,
       feat_ymin, feat_xmin,
       feat_ymax, feat_xmax] = tf.while_loop(condition, body,
                                             [i, feat_labels, feat_scores,
                                              feat_ymin, feat_xmin,
                                              feat_ymax, feat_xmax])
      # Transform to center / size: center, height and width of the
      # groundtruth box assigned to each anchor.
      feat_cy = (feat_ymax + feat_ymin) / 2.
      feat_cx = (feat_xmax + feat_xmin) / 2.
      feat_h = feat_ymax - feat_ymin
      feat_w = feat_xmax - feat_xmin
      # Encode features: offset of the assigned groundtruth box relative to
      # the anchor.
      feat_cy = (feat_cy - yref) / href / prior_scaling[0]
      feat_cx = (feat_cx - xref) / wref / prior_scaling[1]
      feat_h = tf.log(feat_h / href) / prior_scaling[2]
      feat_w = tf.log(feat_w / wref) / prior_scaling[3]
      # Use SSD ordering: x / y / w / h instead of ours.
      feat_localizations = tf.stack([feat_cx, feat_cy, feat_w, feat_h], axis=-1)
      # Per anchor: class label, encoded offset to its groundtruth box, and
      # overlap with that box.
      return feat_labels, feat_localizations, feat_scores
3.损失函数

SSD损失函数分为两部分:

  • localization loss(loc)

  • confidence loss(conf)

定义 $x_{ij}^p = 1$,表示第 $i$ 个 default box 与第 $j$ 个 ground truth box 相匹配,类别为 $p$;若不匹配的话,值为0。

训练目标函数为:$L(x, c, l, g) = \frac{1}{N}\left(L_{conf}(x, c) + \alpha L_{loc}(x, l, g)\right)$

  • $N$ 为匹配的default box数量,
  • $L_{loc}$ 为预测框 $l$ 和ground truth box $g$ 之间的Smooth L1 loss,定义如下:

image.png-39.1kB

  • $l$ 为预测框,$g$ 为ground truth,$d$ 为default box,我们对偏移位置进行回归。$L_{conf}$ 为多类别softmax loss,定义如下,通过交叉验证将 $\alpha$ 设为1:

image.png-25.9kB

  • 关键代码分析:
  # SSD loss definition.
  def ssd_losses(logits, localisations,
                 gclasses, glocalisations, gscores,
                 match_threshold=0.5,
                 negative_ratio=3.,
                 alpha=1.,
                 label_smoothing=0.,
                 device='/cpu:0',
                 scope=None):
      """Add the SSD classification and localization losses to tf.losses.

      All tensor arguments are lists with one entry per feature layer.

      Arguments:
        logits: Predicted class scores;
        localisations: Predicted box offsets;
        gclasses: Target class of every default box;
        glocalisations: Target offset of every default box;
        gscores: Overlap of every default box with its groundtruth box;
        match_threshold: Default boxes with a score above it are positives;
        negative_ratio: Number of negatives kept per positive;
        alpha: Weight of the localization loss;
        label_smoothing: Unused in this function;
        device: Unused in this function;
        scope: Optional name scope.

      Return:
        None; the three losses are registered with tf.losses.add_loss.
      """
      with tf.name_scope(scope, 'ssd_losses'):
          lshape = tfe.get_shape(logits[0], 5)
          num_classes = lshape[-1]  # number of classes
          batch_size = lshape[0]

          # Flatten out all vectors: process the predictions of every
          # feature map, (38,38), (19,19), (10,10), (5,5), (3,3), (1,1).
          # The shapes in the comments are the article's annotations for the
          # first layer.
          flogits = []
          fgclasses = []
          fgscores = []
          flocalisations = []
          fglocalisations = []
          for i, layer_logits in enumerate(logits):
              # Predicted classes, (38*38*4, 21).
              flogits.append(tf.reshape(layer_logits, [-1, num_classes]))
              # Target classes, (38*38*4).
              fgclasses.append(tf.reshape(gclasses[i], [-1]))
              # Overlaps, (38*38*4).
              fgscores.append(tf.reshape(gscores[i], [-1]))
              # Predicted offsets, (38*38*4, 4).
              flocalisations.append(tf.reshape(localisations[i], [-1, 4]))
              # Target offsets, (38*38*4, 4).
              fglocalisations.append(tf.reshape(glocalisations[i], [-1, 4]))
          # Concatenate the layers.
          logits = tf.concat(flogits, axis=0)
          gclasses = tf.concat(fgclasses, axis=0)
          gscores = tf.concat(fgscores, axis=0)
          localisations = tf.concat(flocalisations, axis=0)
          glocalisations = tf.concat(fglocalisations, axis=0)
          dtype = logits.dtype

          # Compute positive matching mask: default boxes whose overlap
          # exceeds match_threshold. Their count is the N of the loss.
          pmask = gscores > match_threshold
          fpmask = tf.cast(pmask, dtype)
          n_positives = tf.reduce_sum(fpmask)

          # Hard negative mining...
          # Labels for the negative loss: 0 wherever pmask is false.
          no_classes = tf.cast(pmask, tf.int32)
          # Softmax over the class scores.
          predictions = slim.softmax(logits)
          # Negatives: not positive, and score above -0.5.
          nmask = tf.logical_and(tf.logical_not(pmask),
                                 gscores > -0.5)
          fnmask = tf.cast(nmask, dtype)
          # Class-0 probability of the negatives; 1.0 everywhere else.
          nvalues = tf.where(nmask,
                             predictions[:, 0],
                             1. - fnmask)
          nvalues_flat = tf.reshape(nvalues, [-1])
          # Number of negative entries to select: negative_ratio (3 by
          # default) per positive, plus batch_size, capped at the number
          # available.
          max_neg_entries = tf.cast(tf.reduce_sum(fnmask), tf.int32)
          n_neg = tf.cast(negative_ratio * n_positives, tf.int32)+batch_size
          n_neg = tf.minimum(n_neg, max_neg_entries)

          # The n_neg smallest class-0 probabilities are the hardest negatives.
          val, _ = tf.nn.top_k(-nvalues_flat, k=n_neg)
          max_hard_pred = -val[-1]
          # Final negative mask.
          nmask = tf.logical_and(nmask, nvalues < max_hard_pred)
          fnmask = tf.cast(nmask, dtype)

          # Add cross-entropy loss.
          # Classification loss of the positives.
          with tf.name_scope('cross_entropy_pos'):
              loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                  logits=logits,
                  labels=gclasses)
              loss = tf.div(tf.reduce_sum(loss * fpmask),
                            batch_size, name='value')
              tf.losses.add_loss(loss)

          # Classification loss of the selected negatives.
          with tf.name_scope('cross_entropy_neg'):
              loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                  logits=logits,
                  labels=no_classes)
              loss = tf.div(tf.reduce_sum(loss * fnmask),
                            batch_size, name='value')
              tf.losses.add_loss(loss)

          # Add localization loss (smooth L1), over the positives only.
          with tf.name_scope('localization'):
              # Weights Tensor: alpha on the positives, 0 elsewhere.
              weights = tf.expand_dims(alpha * fpmask, axis=-1)
              loss = custom_layers.abs_smooth(localisations - glocalisations)
              loss = tf.div(tf.reduce_sum(loss * weights),
                            batch_size,
                            name='value')
              tf.losses.add_loss(loss)

4. Hard Negative Mining

  • 绝大多数的default box都是负例样本,导致正负样本不平衡,训练时采用Hard Negative Mining策略(使正负样本比例为1:3)来平衡正负样本比例。

总结

添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注