@bergus
2015-10-23T02:21:48.000000Z
Python Cmd Crawler Console
Without further ado, here is the code (note that it targets Python 2: it uses the commands module and the print statement).
# encoding=utf-8
import os
import multiprocessing
from cmd import Cmd
import commands
from mycrawler.dbUtil import DbUtil
import signal


# download monitor: open a new terminal running the monitoring script
def run_download_watch():
    os.system("gnome-terminal -x bash -c 'python ./download_process.py' ")


# downloader: open a new terminal running the download script
def run_download():
    os.system("gnome-terminal -x bash -c 'python ./download.py' ")


# spiders: open one terminal per spider name and run "scrapy crawl <name>"
def run_spider(spiders):
    for spider in spiders:
        os.system("gnome-terminal -x bash -c 'scrapy crawl %s'" % spider)


class CLI(Cmd):

    def __init__(self):
        Cmd.__init__(self)
        # set the command prompt and the welcome banner
        self.prompt = ">>> "
        self.intro = '''Welcome to the crawler console
Type 0 for help'''
        self.doc_header = ''
        self.undoc_header = ''
        self.nohelp = "*** no help documentation for command %s"

    def do_download(self, arg):
        ''' Download files'''
        p2 = multiprocessing.Process(target=run_download)
        p2.start()

    def do_0(self, arg):
        self.do_help('')

    def do_help(self, arg):
        ''' Show help for one command, or for every command when called without an argument'''

        def ddoc(ss, name):
            try:
                doc = getattr(ss, 'do_' + name).__doc__
                if doc:
                    print name + ":"
                    print doc
                    return
            except AttributeError:
                ss.stdout.write("%s\n" % str(ss.nohelp % (name,)))

        # "help add" shows help for the add command only
        if arg:
            ddoc(self, arg)
            return

        cmds_doc = [name[3:] for name in self.get_names() if name[:3] == 'do_']
        print self.doc_header
        for c in cmds_doc:
            ddoc(self, c)

    # add new crawl links
    def do_add(self, args):
        """ Add links (vendor site URLs) to the database.
Input format: add name abb;start_urls www.baidu.com www.baidu.com www.baidu.com
add is the command, the rest are its arguments; start_urls may be followed by several values separated by spaces."""
        if not args:
            print "Empty input, please see the help: help add"
            return
        print args
        # each semicolon-separated field is "key value [value ...]";
        # a single value is stored as a string, several values as a list
        data = {}
        for field in args.split(';'):
            parts = field.split(' ')
            data[parts[0]] = parts[1] if len(parts) == 2 else parts[1:]
        print data
        DbUtil().conn().collection('url_items').insert(data)

    # list all spiders
    def do_list_spider(self, args):
        ''' List all spiders'''
        print commands.getoutput("scrapy list")

    # run a single spider
    def do_run_spider(self, arg):
        ''' Run one spider, e.g. run_spider abb'''
        p3 = multiprocessing.Process(target=run_spider, args=(arg.split(),))
        p3.start()
        # os.system('scrapy crawl ' + arg)

    def do_run(self, args):
        ''' Run everything'''
        # run the spiders
        self.do_run_all_spiders('')
        # run the downloader
        p2 = multiprocessing.Process(target=run_download)
        p2.start()
        # run the download monitor
        p3 = multiprocessing.Process(target=run_download_watch)
        p3.start()

    # run all spiders
    def do_run_all_spiders(self, arg):
        ''' Run all spiders'''
        spiders = [s for s in commands.getoutput('scrapy list').split('\n') if s]
        if not spiders:
            print "No spiders found, please check that the project code is correct"
            return
        p = multiprocessing.Process(target=run_spider, args=(spiders,))
        p.start()
        # os.system('./run_spider.sh ' + spider)

    def do_q(self, arg):
        ''' Quit'''
        return True

    # an empty line clears the screen
    def emptyline(self):
        os.system('clear')
        print 'Enter clears the screen, help shows help, tab completes'

    # called when the input cannot be recognized as a command
    def default(self, line):
        print 'Unknown command ' + repr(line) + ', type help to list the available commands'

    # called after the command loop exits
    def postloop(self):
        print 'Thanks for using the console'

    def completedefault(self, *ignored):
        return ['add', 'run_spider', 'run_all_spiders', 'list_spider']


if __name__ == "__main__":
    cli = CLI()
    cli.cmdloop()
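To make the add command concrete, here is a minimal standalone sketch of how its argument string is parsed before being written to the url_items collection. The sample input simply mirrors the format shown in the docstring; the DbUtil call is left out because that helper lives in the author's own mycrawler package.

# standalone sketch of the do_add parsing (Python 2), using the docstring's
# example input "add name abb;start_urls url1 url2 ..."
args = "name abb;start_urls www.baidu.com www.baidu.com www.baidu.com"

data = {}
for field in args.split(';'):
    parts = field.split(' ')
    # one value -> store the string, several values -> store the list
    data[parts[0]] = parts[1] if len(parts) == 2 else parts[1:]

print data
# -> {'name': 'abb', 'start_urls': ['www.baidu.com', 'www.baidu.com', 'www.baidu.com']}
# (dict key ordering may vary)

Typing the same line after the >>> prompt, i.e. add name abb;start_urls www.baidu.com, produces that dict and inserts it into the url_items collection through DbUtil.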
