[关闭]
@awsekfozc 2016-01-09T09:20:14.000000Z 字数 7544 阅读 2248

OpenTSDB 监控报警接入

OpenTSDB

参考:http://www.ttlsa.com/opentsdb/opentsdb-nagios-monitoring-and-alarming-realization/

1)脚本说明

在OpenTSDB源码的tools目录下有一个Python工具check_tsd。该脚本查询OpenTSDB,并以兼容Nagios的格式输出检查结果(OK/WARNING/CRITICAL状态)。

  #!/usr/bin/python
  #
  # Query the TSD for a given metric and compare the returned data points
  # against the supplied thresholds.  The output format is compatible with
  # Nagios, so this script can be used directly as a Nagios command.
  #
  # check_tsd -m mysql.slave.seconds_behind_master -t host=foo -t schema=mydb
  # -d 600 -a avg -x gt -w 50 -c 100
  #
  try:
      import httplib  # Python 2
  except ImportError:
      import http.client as httplib  # Python 3 renamed the module.
  import operator
  import socket
  import sys
  import time
  from optparse import OptionParser


  def _parse_value(text):
      """Convert a data point value from the TSD ASCII output to a number.

      Integers stay ints; everything else (decimals, exponent notation, NaN)
      goes through float().  Raises ValueError if `text` is not numeric.
      """
      try:
          return int(text)
      except ValueError:
          return float(text)


  def main(argv):
      """Pull data out of the TSD and do very simple alerting for Nagios.

      `argv` is the full argument vector; argv[0] (the program name) is
      skipped.  Returns the Nagios status code: 0 (OK), 1 (WARNING) or
      2 (CRITICAL, also used for connection and HTTP errors).  Invalid
      options are reported through optparse's parser.error(), which prints
      a usage message and exits the process.
      """
      parser = OptionParser(description='Simple TSDB data extractor for Nagios.')
      parser.add_option('-H', '--host', default='localhost', metavar='HOST',
                        help='Hostname to use to connect to the TSD.')
      parser.add_option('-p', '--port', type='int', default=80, metavar='PORT',
                        help='Port to connect to the TSD instance on.')
      parser.add_option('-m', '--metric', metavar='METRIC',
                        help='Metric to query.')
      parser.add_option('-t', '--tag', action='append', default=[],
                        metavar='TAG', help='Tags to filter the metric on.')
      parser.add_option('-d', '--duration', type='int', default=600,
                        metavar='SECONDS', help='How far back to look for data.')
      parser.add_option('-D', '--downsample', default='none', metavar='METHOD',
                        help='Downsample function, e.g. one of avg, min, sum, or max.')
      parser.add_option('-W', '--downsample-window', type='int', default=60,
                        metavar='SECONDS', help='Window size over which to downsample.')
      parser.add_option('-a', '--aggregator', default='sum', metavar='METHOD',
                        help='Aggregation method: avg, min, sum (default), max.')
      parser.add_option('-x', '--method', dest='comparator', default='gt',
                        metavar='METHOD', help='Comparison method: gt, ge, lt, le, eq, ne.')
      parser.add_option('-r', '--rate', default=False,
                        action='store_true', help='Use rate value as comparison operand.')
      parser.add_option('-w', '--warning', type='float', metavar='THRESHOLD',
                        help='Threshold for warning. Uses the comparison method.')
      parser.add_option('-c', '--critical', type='float', metavar='THRESHOLD',
                        help='Threshold for critical. Uses the comparison method.')
      parser.add_option('-v', '--verbose', default=False,
                        action='store_true', help='Be more verbose.')
      parser.add_option('-T', '--timeout', type='int', default=10,
                        metavar='SECONDS',
                        help='How long to wait for the response from TSD.')
      parser.add_option('-E', '--no-result-ok', default=False,
                        action='store_true',
                        help='Return OK when TSD query returns no result.')
      parser.add_option('-I', '--ignore-recent', default=0, type='int',
                        metavar='SECONDS',
                        help='Ignore data points that are that recent.')
      parser.add_option('-S', '--ssl', default=False, action='store_true',
                        help='Make queries to OpenTSDB via SSL (https)')
      (options, args) = parser.parse_args(args=argv[1:])

      # Validate the arguments.
      if options.comparator not in ('gt', 'ge', 'lt', 'le', 'eq', 'ne'):
          parser.error("Comparator '%s' not valid." % options.comparator)
      elif options.downsample not in ('none', 'avg', 'min', 'sum', 'max'):
          parser.error("Downsample '%s' not valid." % options.downsample)
      elif options.aggregator not in ('avg', 'min', 'sum', 'max'):
          parser.error("Aggregator '%s' not valid." % options.aggregator)
      elif not options.metric:
          parser.error('You must specify a metric (option -m).')
      elif options.duration <= 0:
          parser.error('Duration must be strictly positive.')
      elif options.downsample_window <= 0:
          parser.error('Downsample window must be strictly positive.')
      elif options.critical is None and options.warning is None:
          parser.error('You must specify at least a warning threshold (-w) or a'
                       ' critical threshold (-c).')
      elif options.ignore_recent < 0:
          parser.error('--ignore-recent must be positive.')

      # When only one threshold is given, use it for both levels.  Compare
      # against None rather than truthiness: 0 is a legitimate threshold and
      # must not be overwritten by the other one.
      if options.critical is None:
          options.critical = options.warning
      elif options.warning is None:
          options.warning = options.critical

      # Build the tag filter, e.g. "{host=foo,schema=mydb}".
      tags = ','.join(options.tag)
      if tags:
          tags = '{' + tags + '}'

      # Assemble the query URL.
      if options.downsample == 'none':
          downsampling = ''
      else:
          downsampling = '%ds-%s:' % (options.downsample_window,
                                      options.downsample)
      if options.rate:
          rate = 'rate:'
      else:
          rate = ''
      url = ('/q?start=%ss-ago&m=%s:%s%s%s%s&ascii&nagios'
             % (options.duration, options.aggregator, downsampling, rate,
                options.metric, tags))

      tsd = '%s:%d' % (options.host, options.port)
      if options.ssl:  # Pick the class to instantiate first.
          conn_class = httplib.HTTPSConnection
      else:
          conn_class = httplib.HTTPConnection
      if sys.version_info[:2] >= (2, 6):
          conn = conn_class(tsd, timeout=options.timeout)
      else:  # Python 2.5 or less: the timeout kwarg would make it croak.
          conn = conn_class(tsd)

      try:
          conn.connect()
      except socket.error:
          # sys.exc_info() is used instead of "except ... as e" so that the
          # same source parses on Python 2.5 as well as on Python 3.
          print("ERROR: couldn't connect to %s: %s" % (tsd, sys.exc_info()[1]))
          return 2
      if options.verbose:
          peer = conn.sock.getpeername()
          print('Connected to %s:%d' % (peer[0], peer[1]))
          conn.set_debuglevel(1)

      now = int(time.time())
      try:
          conn.request('GET', url)
          res = conn.getresponse()
          datapoints = res.read()
      except socket.error:
          print("ERROR: couldn't GET %s from %s: %s"
                % (url, tsd, sys.exc_info()[1]))
          return 2
      finally:
          # Close the connection on the error path too.
          conn.close()

      # Python 3 returns bytes from read(); work on text from here on.
      if not isinstance(datapoints, str):
          datapoints = datapoints.decode('utf-8', 'replace')

      # The HTTP request failed.
      if res.status not in (200, 202):
          print('CRITICAL: status = %d when talking to %s:%d'
                % (res.status, options.host, options.port))
          if options.verbose:
              print('TSD said:')
              print(datapoints)
          return 2

      # The HTTP request succeeded.
      if options.verbose:
          print(datapoints)
      datapoints = datapoints.splitlines()

      def no_data_point():
          """Report that the TSD returned no usable data point."""
          if options.no_result_ok:
              print('OK: query did not return any data point (--no-result-ok)')
              return 0
          else:
              print('CRITICAL: query did not return any data point')
              return 2

      if not datapoints:
          return no_data_point()

      comparator = getattr(operator, options.comparator)
      rv = 0         # Return value of this script: 0-OK, 1-WARNING, 2-CRITICAL.
      badts = None   # Timestamp of the worst value that crossed a threshold.
      badval = None  # The worst value that crossed a threshold.
      npoints = 0    # How many data points were taken into account?
      nbad = 0       # How many of them crossed a threshold?
      for datapoint in datapoints:
          # Each line is split on whitespace; field 1 is read as the
          # timestamp and field 2 as the value.
          fields = datapoint.split()
          ts = int(fields[1])
          delta = now - ts
          if delta > options.duration or delta <= options.ignore_recent:
              continue  # Outside the duration / ignore-recent window.
          npoints += 1
          val = _parse_value(fields[2])
          bad = False  # Is the current value bad?
          # Compare against the critical threshold first, then warning.
          if comparator(val, options.critical):
              rv = 2
              bad = True
              nbad += 1
          elif rv < 2 and comparator(val, options.warning):
              rv = 1
              bad = True
              nbad += 1
          if (bad and
              (badval is None                  # First bad value we find.
               or comparator(val, badval))):   # Worse value.
              badval = val
              badts = ts

      if options.verbose and len(datapoints) != npoints:
          print('ignored %d/%d data points for being more than %ds old'
                % (len(datapoints) - npoints, len(datapoints), options.duration))
      if not npoints:
          return no_data_point()
      if badts:
          if options.verbose:
              print('worse data point value=%s at ts=%s' % (badval, badts))
          badts = time.asctime(time.localtime(badts))

      # In NRPE the pipe character is special, but it is used in tag searches.
      # Translate it to something else for the purposes of output.
      ttags = tags.replace("|", ":")
      if not rv:
          print('OK: %s%s: %d values OK, last=%r'
                % (options.metric, ttags, npoints, val))
      else:
          if rv == 1:
              level = 'WARNING'
              threshold = options.warning
          elif rv == 2:
              level = 'CRITICAL'
              threshold = options.critical
          print('%s: %s%s %s %s: %d/%d bad values (%.1f%%) worst: %r @ %s'
                % (level, options.metric, ttags, options.comparator, threshold,
                   nbad, npoints, nbad * 100.0 / npoints, badval, badts))
      return rv


  if __name__ == '__main__':
      sys.exit(main(sys.argv))

2)参数说明

  1. Options:
  2. -h, --help 打印帮助信息
  3. -H HOST, --host=HOST 指定TSDB服务器
  4. -p PORT, --port=PORT 指定TSDB端口
  5. -m METRIC, --metric=METRIC
  6. 查询的metric
  7. -t TAG, --tag=TAG metric过滤的tags
  8. -d SECONDS, --duration=SECONDS
  9. 查询多久的数据,默认600s
  10. -D METHOD, --downsample=METHOD
  11. 降采样函数,例如avg、min、sum或max。
  12. -W SECONDS, --downsample-window=SECONDS
  13. 降采样窗口大小
  14. -a METHOD, --aggregator=METHOD
  15. 聚合方法: avg, min, sum (default), max.
  16. -x METHOD, --method=METHOD
  17. 比较法: gt, ge, lt, le, eq, ne.
  18. -r, --rate 使用速率值作为比较的操作数
  19. -w THRESHOLD, --warning=THRESHOLD
  20. warning阈值.
  21. -c THRESHOLD, --critical=THRESHOLD
  22. critical阈值.
  23. -v, --verbose 显示详细信息
  24. -T SECONDS, --timeout=SECONDS
  25. 等待TSDB响应时间
  26. -E, --no-result-ok TSD查询没有结果返回时,返回OK状态
  27. -I SECONDS, --ignore-recent=SECONDS
  28. 忽略最近的数据点
  29. -S, --ssl 通过SSL查询OpenTSDB (https)

3)报警接入

Nagios服务器搭建

将check_tsd脚本放在Nagios服务器/usr/local/nagios/libexec目录里。
编辑/usr/local/nagios/etc/objects/commands.cfg文件,添加下面的命令:
  1. define command{
  2. command_name check_tsd
  3. command_line $USER1$/check_tsd -H 10.0.101.145 -p 4242 -t host=$HOSTADDRESS$ $ARG1$
  4. }
  5. ###添加监控项
  6. define service {
  7. use MongoDB
  8. host_name 10.0.0.166
  9. service_description query opcounters per second
  10. check_command check_tsd!-m rate:mongo.opcounters -t type=query -d 60 -w 300 -c 500
  11. }

在此输入正文

添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注