[关闭]
@xuduochoua 2016-05-23T07:08:52.000000Z 字数 27419 阅读 1844

VIC简单爬虫程序

爬虫 demo


项目说明

  1. maven项目
  2. 利用Jsoup完成对网页内容的抓取,包括文章列表、文章详情,
  3. 以及详情中的图片,并写入本地文件

一、准备工作

1. 搭建maven工程

pom文件如下

  1. <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  2. xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  3. <modelVersion>4.0.0</modelVersion>
  4. <groupId>craw</groupId>
  5. <artifactId>craw</artifactId>
  6. <version>0.0.1-SNAPSHOT</version>
  7. <dependencies>
  8. <dependency>
  9. <groupId>commons-httpclient</groupId>
  10. <artifactId>commons-httpclient</artifactId>
  11. <version>3.1</version>
  12. </dependency>
  13. <dependency>
  14. <groupId>org.codehaus.jackson</groupId>
  15. <artifactId>jackson-mapper-asl</artifactId>
  16. <version>1.9.12</version>
  17. </dependency>
  18. <!-- 提供一些基础的、通用的操作和处理,如自动生成toString()的结果、自动实现hashCode()和equals()方法、数组操作、枚举、日期和时间的处理等等。 -->
  19. <dependency>
  20. <groupId>commons-lang</groupId>
  21. <artifactId>commons-lang</artifactId>
  22. <version>2.6</version>
  23. </dependency>
  24. <!-- Commons项目中用来处理常用的编码方法的工具类包,例如DES、SHA1、MD5、Base64,URL,Soundx等等。[1]
  25. 不仅是编码,也可用于解码。 -->
  26. <dependency>
  27. <groupId>commons-codec</groupId>
  28. <artifactId>commons-codec</artifactId>
  29. <version>1.10</version>
  30. </dependency>
  31. <!-- Commons-logging的目的是为“所有的Java日志实现”提供一个统一的接口,它自身的日志功能非常弱 -->
  32. <dependency>
  33. <groupId>commons-logging</groupId>
  34. <artifactId>commons-logging</artifactId>
  35. <version>1.2</version>
  36. </dependency>
  37. <dependency>
  38. <groupId>commons-io</groupId>
  39. <artifactId>commons-io</artifactId>
  40. <version>2.4</version>
  41. </dependency>
  42. <dependency>
  43. <groupId>org.jsoup</groupId>
  44. <artifactId>jsoup</artifactId>
  45. <version>1.8.3</version>
  46. </dependency>
  47. <!-- slf4j 配合 logback 做日志 start-->
  48. <dependency>
  49. <groupId>org.slf4j</groupId>
  50. <artifactId>slf4j-api</artifactId>
  51. <version>1.7.8</version>
  52. </dependency>
  53. <dependency>
  54. <groupId>ch.qos.logback</groupId>
  55. <artifactId>logback-classic</artifactId>
  56. <version>1.1.3</version>
  57. </dependency>
  58. <dependency>
  59. <groupId>ch.qos.logback</groupId>
  60. <artifactId>logback-core</artifactId>
  61. <version>1.1.3</version>
  62. </dependency>
  63. <!-- slf4j 配合 logback 做日志 end-->
  64. </dependencies>
  65. <build>
  66. <plugins>
  67. <plugin>
  68. <groupId>org.apache.maven.plugins</groupId>
  69. <artifactId>maven-compiler-plugin</artifactId>
  70. <configuration>
  71. <source>1.7</source>
  72. <target>1.7</target>
  73. </configuration>
  74. </plugin>
  75. </plugins>
  76. </build>
  77. </project>

2. 引入slf4j日志

  1. 引入jar包
  2. 配置src下logback.xml
  1. <?xml version="1.0" encoding="UTF-8"?>
  2. <!--
  3. scan:当此属性设置为true时,配置文件如果发生改变,将会被重新加载,默认值为true。
  4. scanPeriod:设置监测配置文件是否有修改的时间间隔,如果没有给出时间单位,默认单位是毫秒。当scan为true时,此属性生效。默认的时间间隔为1分钟。
  5. debug:当此属性设置为true时,将打印出logback内部日志信息,实时查看logback运行状态。默认值为false。
  6. -->
  7. <configuration scan="false" scanPeriod="60 seconds" debug="false">
  8. <!-- 定义日志的根目录 -->
  9. <property name="LOG_HOME" value="/app/log" />
  10. <!-- 定义日志文件名称 -->
  11. <property name="appName" value="netty"></property>
  12. <!-- ch.qos.logback.core.ConsoleAppender 表示控制台输出 -->
  13. <appender name="stdout" class="ch.qos.logback.core.ConsoleAppender">
  14. <Encoding>UTF-8</Encoding>
  15. <!--
  16. 日志输出格式:%d表示日期时间,%thread表示线程名,%-5level:级别从左显示5个字符宽度
  17. %logger{50} 表示logger名字最长50个字符,否则按照句点分割。 %msg:日志消息,%n是换行符
  18. -->
  19. <layout class="ch.qos.logback.classic.PatternLayout">
  20. <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{50} - %msg%n</pattern>
  21. </layout>
  22. </appender>
  23. <!-- 滚动记录文件,先将日志记录到指定文件,当符合某个条件时,将日志记录到其他文件 -->
  24. <appender name="appLogAppender" class="ch.qos.logback.core.rolling.RollingFileAppender">
  25. <Encoding>UTF-8</Encoding>
  26. <!-- 指定日志文件的名称 -->
  27. <file>${LOG_HOME}/${appName}.log</file>
  28. <!--
  29. 当发生滚动时,决定 RollingFileAppender 的行为,涉及文件移动和重命名
  30. TimeBasedRollingPolicy: 最常用的滚动策略,它根据时间来制定滚动策略,既负责滚动也负责触发滚动。
  31. -->
  32. <rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
  33. <!--
  34. 滚动时产生的文件的存放位置及文件名称 %d{yyyy-MM-dd}:按天进行日志滚动
  35. %i:当文件大小超过maxFileSize时,按照i进行文件滚动
  36. -->
  37. <fileNamePattern>${LOG_HOME}/${appName}-%d{yyyy-MM-dd}-%i.log</fileNamePattern>
  38. <!--
  39. 可选节点,控制保留的归档文件的最大数量,超出数量就删除旧文件。假设设置每天滚动,
  40. 且maxHistory是365,则只保存最近365天的文件,删除之前的旧文件。注意,删除旧文件时,
  41. 那些为了归档而创建的目录也会被删除。
  42. -->
  43. <MaxHistory>365</MaxHistory>
  44. <!--
  45. 当日志文件超过maxFileSize指定的大小时,根据上面提到的%i进行日志文件滚动 注意此处配置SizeBasedTriggeringPolicy是无法实现按文件大小进行滚动的,必须配置timeBasedFileNamingAndTriggeringPolicy
  46. -->
  47. <timeBasedFileNamingAndTriggeringPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedFNATP">
  48. <maxFileSize>100MB</maxFileSize>
  49. </timeBasedFileNamingAndTriggeringPolicy>
  50. </rollingPolicy>
  51. <!--
  52. 日志输出格式:%d表示日期时间,%thread表示线程名,%-5level:级别从左显示5个字符宽度 %logger{50} 表示logger名字最长50个字符,否则按照句点分割。 %msg:日志消息,%n是换行符
  53. -->
  54. <layout class="ch.qos.logback.classic.PatternLayout">
  55. <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [ %thread ] - [ %-5level ] [ %logger{50} : %line ] - %msg%n</pattern>
  56. </layout>
  57. </appender>
  58. <!--
  59. logger主要用于存放日志对象,也可以定义日志类型、级别
  60. name:表示匹配的logger类型前缀,也就是包的前半部分
  61. level:要记录的日志级别,包括 TRACE < DEBUG < INFO < WARN < ERROR
  62. additivity:作用在于children-logger是否使用 rootLogger配置的appender进行输出,false:表示只用当前logger的appender-ref,true:表示当前logger的appender-ref和rootLogger的appender-ref都有效
  63. -->
  64. <!-- hibernate logger -->
  65. <logger name="org.hibernate" level="error" />
  66. <!-- Spring framework logger -->
  67. <logger name="org.springframework" level="error" additivity="false"></logger>
  68. <logger name="com.vic.study" level="info" additivity="true">
  69. <appender-ref ref="appLogAppender" />
  70. </logger>
  71. <!--
  72. root与logger是父子关系,没有特别定义则默认为root,任何一个类只会和一个logger对应,
  73. 要么是定义的logger,要么是root,判断的关键在于找到这个logger,然后判断这个logger的appender和level。
  74. -->
  75. <root level="info">
  76. <appender-ref ref="stdout" />
  77. <appender-ref ref="appLogAppender" />
  78. </root>
  79. </configuration>

3. 引入常用工具类

详见VIC的工具类

最重要的为正则工具类

二、编码

1、 Jsoup连接类CrawlConnect

HTTP请求相关

  1. import com.vic.study.crawl.util.IOUtils;
  2. /**
  3. * HTTP请求工具类
  4. * @author VIC
  5. *
  6. */
  7. public class CrawlConnect {
  8. private final Logger logger = LoggerFactory.getLogger(CrawlConnect.class);
  9. private Connection connection;
  10. /**
  11. * 必定先调用这个方法
  12. * @param url
  13. * @return
  14. */
  15. public CrawlConnect(Connection connection){
  16. this.connection = connection;
  17. }
  18. public CrawlConnect url(String url){
  19. connection.url(url);
  20. return this;
  21. }
  22. public CrawlConnect url(URL url){
  23. connection.url(url);
  24. return this;
  25. }
  26. public CrawlConnect cookie(String name, String value){
  27. connection.cookie(name, value);
  28. return this;
  29. }
  30. public CrawlConnect cookie(Map<String, String> cookies){
  31. connection.cookies(cookies);
  32. return this;
  33. }
  34. public CrawlConnect data(String... keyvals){
  35. connection.data(keyvals);
  36. return this;
  37. }
  38. public CrawlConnect data(String key, String value ) {
  39. connection.data(key, value);
  40. return this;
  41. }
  42. public CrawlConnect data(String key, String filename, InputStream in){
  43. connection.data(key, filename, in);
  44. return this;
  45. }
  46. public Connection.Response execute() throws IOException{
  47. return connection.execute();
  48. }
  49. public CrawlConnect followRedirects(boolean followRedirects){
  50. connection.followRedirects(followRedirects);
  51. return this;
  52. }
  53. //get
  54. public Document getDocument() throws IOException {
  55. return connection.get();
  56. }
  57. public String getHtml() throws IOException{
  58. return this.getDocument().html();
  59. }
  60. public String getBodyText() throws IOException{
  61. return this.getDocument().body().text();
  62. }
  63. public CrawlConnect header(String key, String value) {
  64. connection.header(key, value);
  65. return this;
  66. }
  67. public CrawlConnect maxBodySize(int bytes) {
  68. connection.maxBodySize(bytes);
  69. return this;
  70. }
  71. public CrawlConnect method(Connection.Method method){
  72. connection.method(method);
  73. return this;
  74. }
  75. //Provide an alternate parser to use when parsing the response to a Document.
  76. public CrawlConnect parser(Parser parser){
  77. connection.parser(parser);
  78. return this;
  79. }
  80. //post
  81. public Document postDocument() throws IOException{
  82. return connection.post();
  83. }
  84. public String postHtml() throws IOException{
  85. return this.postDocument().html();
  86. }
  87. public String postBodyText() throws IOException{
  88. return this.postDocument().body().text();
  89. }
  90. //Sets the default post data character set for x-www-form-urlencoded post data
  91. public CrawlConnect postDataCharset(String charset) {
  92. connection.postDataCharset(charset);
  93. return this;
  94. }
  95. public Connection.Request request(){
  96. return connection.request();
  97. }
  98. public Connection.Response response(){
  99. return connection.response();
  100. }
  101. //Disable/enable TLS certificates validation for HTTPS requests.
  102. public CrawlConnect validateTLSCertificates(boolean value) {
  103. connection.validateTLSCertificates(value);
  104. return this;
  105. }
  106. /**
  107. * 下载文件到本地
  108. * @param path
  109. * @param fileName
  110. * @throws IOException
  111. */
  112. public void downFile(String path, String fileName) throws IOException{
  113. Response response = this.execute();
  114. File file = IOUtils.getFileByPathAndName(path, fileName);
  115. FileOutputStream out = new FileOutputStream(file);
  116. out.write(response.bodyAsBytes());
  117. out.close();
  118. }
  119. }

2. 爬虫基类BaseCrawl

爬虫类继承此抽象类,方便连接、操作网页

  1. import org.jsoup.Connection;
  2. import org.jsoup.Jsoup;
  3. import com.vic.study.crawl.util.PropertiesUtil;
  4. /**
  5. * 爬虫基类
  6. * @author VIC
  7. *
  8. */
  9. public abstract class BaseCrawl {
  10. /**
  11. * 连接网页
  12. * @param url
  13. * @return
  14. */
  15. protected CrawlConnect con(String url){
  16. Connection conn = Jsoup.connect(url).ignoreContentType(true).timeout(PropertiesUtil.getIntByKey("timeout"));
  17. return new CrawlConnect(conn);
  18. }
  19. }

3. 爬虫主程序CrawlCnblogList

根据分类抓取博客园下文章列表及详情及其他 写入本地文件

  1. import java.io.File;
  2. import java.io.IOException;
  3. import java.util.Date;
  4. import java.util.List;
  5. import java.util.UUID;
  6. import org.apache.commons.io.FileUtils;
  7. import org.apache.commons.lang.StringUtils;
  8. import org.slf4j.Logger;
  9. import org.slf4j.LoggerFactory;
  10. import com.vic.study.crawl.util.IOUtils;
  11. import com.vic.study.crawl.util.RegexUtil;
  12. /**
  13. * 抓取博客园的列表以及详情
  14. * @author VIC
  15. *
  16. */
  17. public class CrawlCnblogList extends BaseCrawl{
  18. private Logger logger = LoggerFactory.getLogger(CrawlCnblogList.class);
  19. private static final String SAVE_LOCATION = "E:/crawl/";//下载的保存位置
  20. private static final String baseUrl = "http://www.cnblogs.com/cate/";
  21. private static final String listRegex = "<div id=\"post_list\">(.*?)</div>\\s*<script>"; //列表页
  22. private static final String itemRegex = "<div class=\"post_item\">(.*?)<div class=\"clear\"></div>\\s*</div>"; //提取列表页的每个item
  23. private static final String titleRegex = "<h3><a class=\"titlelnk\" href=\".*?\" target=\"_blank\">(.*?)</a></h3>";//标题
  24. private static final String linkRegex = "<h3><a class=\"titlelnk\" href=\"(.*?)\" target=\"_blank\">.*?</a></h3>";//链接
  25. private static final String descRegex = "<p class=\"post_item_summary\"> (.*?)</p>";//简介
  26. private static final String authorRegex = "<a href=\".*?\" class=\"lightblue\">(.*?)</a>";//作者
  27. private static final String timeRegex = "发布于 (.*?)<span class=\"article_comment\">";//发布时间
  28. private static final String detailRegex = "<div id=\"cnblogs_post_body\">(.*?)</div>\\s*<div id=\"MySignature\"></div>";//文章详情内容
  29. private static final String imgRegex = "<img .*? src=\"(.*?)\" .*?>";//详情页图片地址
  30. /**
  31. * 根据分类抓取博客园下文章列表及详情及其他 写入本地文件
  32. * @param categories 如java/,php/
  33. * @throws IOException
  34. */
  35. public void list(String...categories) {
  36. if(categories.length < 1) {
  37. logger.info("请输入至少一个分类");
  38. return;
  39. }
  40. for(String category : categories){
  41. String url = baseUrl + category +"/";
  42. try {
  43. String basePath = SAVE_LOCATION + category +"/" + new Date().getTime();
  44. //获取文章列表
  45. String html = con(url).getHtml();
  46. String itemsStr = RegexUtil.getFirstString(html, listRegex, 1);
  47. List<String> list = RegexUtil.getList(itemsStr, itemRegex, 1);
  48. for(String item : list) {
  49. String title = RegexUtil.getFirstString(item, titleRegex,1);
  50. String link = RegexUtil.getFirstString(item, linkRegex,1);
  51. String desc = RegexUtil.getFirstString(item, descRegex,1);
  52. String author = RegexUtil.getFirstString(item, authorRegex,1);
  53. String time = RegexUtil.getFirstString(item, timeRegex,1);
  54. StringBuffer baseContent = new StringBuffer();
  55. baseContent.append("标题:").append(title).append("\r\n");
  56. baseContent.append("链接:").append(link).append("\r\n");
  57. baseContent.append("作者:").append(author).append("\r\n");
  58. baseContent.append("时间:").append(time).append("\r\n");
  59. baseContent.append("简介:").append(desc).append("\r\n");
  60. String titleDir = StringUtils.isEmpty(title)? ("title" +UUID.randomUUID()) : title;
  61. String baseDir = basePath + File.separator + titleDir ;//每一篇文章存的目录
  62. String fileName = (StringUtils.isEmpty(title)? ("title" +UUID.randomUUID()) : title ) + ".txt";
  63. File baseFile = IOUtils.getFileByPathAndName(baseDir, fileName);
  64. FileUtils.write(baseFile, baseContent.toString(), "utf-8");
  65. //获取详情
  66. try{
  67. String detailPath = baseDir + "/detail/";
  68. String detail = con(link).getHtml();
  69. String content = RegexUtil.getFirstString(detail, detailRegex, 1);
  70. FileUtils.writeStringToFile(IOUtils.getFileByPathAndName(detailPath, fileName), content, "utf-8");
  71. List<String> imgs = RegexUtil.getList(content, imgRegex, 1);
  72. if(imgs.size() > 0) {
  73. for(String img : imgs){
  74. String imgName = img.substring(img.lastIndexOf("/")+ 1, img.length());
  75. try{
  76. con(img).downFile(detailPath, imgName);
  77. logger.warn("图片{}下载成功{}",imgName, img);
  78. }catch(Exception e) {
  79. logger.warn("图片{}下载失败",imgName);
  80. }
  81. }
  82. }
  83. }catch(Exception e){
  84. logger.warn("获得{}详情失败:{}", title, link);
  85. }
  86. }
  87. } catch (IOException e) {
  88. logger.warn("获得{}列表失败:{}", categories, url);
  89. e.printStackTrace();
  90. }
  91. }
  92. }
  93. public static void main(String[] args) {
  94. CrawlCnblogList l = new CrawlCnblogList();
  95. l.list("php", "java");
  96. }
  97. }

备注

这是个垂直抓取的网络爬虫Demo,

由于未察觉博客园对请求header做相关验证,本Demo中并未对头信息作相应伪装。

HtmlUnit

  1. package com.guoyuan.scene.database.test;
  2. import java.net.URL;
  3. import java.util.List;
  4. import com.gargoylesoftware.htmlunit.Page;
  5. import com.gargoylesoftware.htmlunit.WebClient;
  6. import com.gargoylesoftware.htmlunit.WebRequest;
  7. import com.gargoylesoftware.htmlunit.html.HtmlButton;
  8. import com.gargoylesoftware.htmlunit.html.HtmlDivision;
  9. import com.gargoylesoftware.htmlunit.html.HtmlImageInput;
  10. import com.gargoylesoftware.htmlunit.html.HtmlInput;
  11. import com.gargoylesoftware.htmlunit.html.HtmlPage;
  12. public class HttpTest {
  13. public static void main(String[] args) throws Exception {
  14. String url = "http://oa.gyzq.com.cn:8082/welcome.do";// 想采集的网址
  15. String refer = "http://outofmemory.cn/";
  16. URL link = new URL(url);
  17. WebClient wc = new WebClient();
  18. WebRequest request = new WebRequest(link);
  19. request.setCharset("UTF-8");
  20. request.setAdditionalHeader("Referer", refer);// 设置请求报文头里的refer字段
  21. // //设置请求报文头里的User-Agent字段
  22. request
  23. .setAdditionalHeader("User-Agent",
  24. "Mozilla/5.0 (Windows NT 5.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2");
  25. // wc.addRequestHeader("User-Agent",
  26. // "Mozilla/5.0 (Windows NT 5.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2");
  27. // wc.addRequestHeader和request.setAdditionalHeader功能应该是一样的。选择一个即可。
  28. // 其他报文头字段可以根据需要添加
  29. wc.getCookieManager().setCookiesEnabled(true);// 开启cookie管理
  30. wc.getOptions().setJavaScriptEnabled(true);// 开启js解析。对于变态网页,这个是必须的
  31. wc.getOptions().setCssEnabled(true);// 开启css解析。对于变态网页,这个是必须的。
  32. wc.getOptions().setThrowExceptionOnFailingStatusCode(false);
  33. wc.getOptions().setThrowExceptionOnScriptError(false);
  34. wc.getOptions().setTimeout(10000);
  35. // 设置cookie。如果你有cookie,可以在这里设置
  36. // Set<Cookie> cookies = null;
  37. // Iterator<Cookie> i = cookies.iterator();
  38. // while (i.hasNext())
  39. // {
  40. // wc.getCookieManager().addCookie(i.next());
  41. // }
  42. // 准备工作已经做好了
  43. HtmlPage page = null;
  44. page = wc.getPage(request);
  45. if (page == null) {
  46. System.out.println("采集 " + url + " 失败!!!");
  47. return;
  48. }
  49. String content = page.asXml();// 网页内容保存在content里
  50. if (content == null) {
  51. System.out.println("采集 " + url + " 失败!!!");
  52. return;
  53. }
  54. List<?> inputList = page.getByXPath("//input[@name='image']");
  55. HtmlImageInput image = (HtmlImageInput)inputList.get(0);
  56. HtmlInput userid = (HtmlInput)page.getHtmlElementById("userId");
  57. HtmlInput pass = (HtmlInput)page.getHtmlElementById("pass");
  58. userid.setValueAttribute("0254");
  59. pass.setValueAttribute("password123");
  60. image.click();
  61. request = new WebRequest(new URL("http://oa.gyzq.com.cn:8082/UIProcessor?Table=vGYMS_JBSQ"));
  62. request.setCharset("UTF-8");
  63. request.setAdditionalHeader("Referer", refer);// 设置请求报文头里的refer字段
  64. // //设置请求报文头里的User-Agent字段
  65. request
  66. .setAdditionalHeader("User-Agent",
  67. "Mozilla/5.0 (Windows NT 5.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2");
  68. HtmlPage page2 = wc.getPage(request);
  69. List<?> buttonList = page2.getElementsByTagName("button");
  70. List<?> divList = page2.getByXPath("//div[contains(@class,'x-grid3-row')]");
  71. for (int i = 0; i < buttonList.size(); i++) {
  72. HtmlButton button = (HtmlButton)buttonList.get(i);
  73. System.out.println(button.getTextContent());
  74. }
  75. HtmlButton button = (HtmlButton)page2.getElementById("ext-gen39");
  76. for (int i = 0; i < divList.size(); i++) {
  77. HtmlDivision div = (HtmlDivision)divList.get(i);
  78. div.click();
  79. if (button != null) {
  80. Page popup = button.click();
  81. if (popup.isHtmlPage()) {
  82. System.out.println(((HtmlPage) popup).asXml());
  83. }
  84. }
  85. System.out.println(div.asXml());
  86. }
  87. // Page page2 = image.click();
  88. // if (page2.isHtmlPage()) {
  89. // HtmlPage htmlPage = (HtmlPage) page2;
  90. // List<FrameWindow> iframeList = htmlPage.getFrames();
  91. // for (FrameWindow frame : iframeList) {
  92. // if (frame.getName().equals("fraRightFrame")) {
  93. // final HtmlPage pageTwo = (HtmlPage) frame.getEnclosedPage();
  94. // HtmlAnchor anchor = pageTwo.getAnchorByText("陈洋于2015-08-24发起的发文申请流程");
  95. // HtmlPage page3 = anchor.click();
  96. // System.out.println(page3.asXml());
  97. // System.out.println(page3.asXml());
  98. // }
  99. // }
  100. // }
  101. // // 搞定了
  102. // CookieManager CM = wc.getCookieManager(); // WC = Your WebClient's name
  103. // Set<Cookie> cookies_ret = CM.getCookies();// 返回的Cookie在这里,下次请求的时候可能可以用上啦。
  104. // System.out.println(123);
  105. }
  106. }

ERPCrawl

  1. package com.vic.study.crawl.erp;
  2. import java.io.File;
  3. import java.io.FileInputStream;
  4. import java.io.IOException;
  5. import java.io.InputStream;
  6. import java.net.URLDecoder;
  7. import java.util.Date;
  8. import java.util.HashMap;
  9. import java.util.List;
  10. import java.util.Map;
  11. import java.util.UUID;
  12. import org.apache.commons.lang.time.DateFormatUtils;
  13. import org.apache.commons.lang.time.DateUtils;
  14. import org.jsoup.Connection.Method;
  15. import org.jsoup.Connection.Response;
  16. import org.jsoup.nodes.Document;
  17. import org.jsoup.nodes.Element;
  18. import com.vic.study.crawl.BaseCrawl;
  19. import com.vic.study.crawl.util.ERPContants;
  20. import com.vic.study.crawl.util.RegexUtil;
  21. /**
  22. * 抓取ERP 自动提交加班流程
  23. * @author VIC
  24. *
  25. */
  26. public class ErpCrawl extends BaseCrawl{
  27. public static void printMap(Map<String, String> map){
  28. System.out.println("打印map start--------------->>");
  29. for(Map.Entry<String, String> entry : map.entrySet()) {
  30. System.out.println(entry.getKey() + "—" + entry.getValue());
  31. }
  32. System.out.println("打印map end<<---------------------");
  33. }
  34. /**
  35. * 登陆
  36. * @return
  37. */
  38. public Map<String, String> login(){
  39. try {
  40. Map<String, String> cookies = con(ERPContants.LOGIN_URL).data("userId", ERPContants.USERNAME).data("pass", ERPContants.PASSWORD).execute().cookies();
  41. if(cookies != null) {
  42. Document docemnet = con(ERPContants.HOME_PAGE_URL).cookie(cookies).getDocument();
  43. Element userNameEle=docemnet.getElementById("userNameCt");
  44. if(userNameEle!=null){
  45. logger.warn("{} 登陆成功" , ERPContants.USERNAME);
  46. return cookies;
  47. }else{
  48. logger.warn("{}登陆失败" , ERPContants.USERNAME);
  49. }
  50. }
  51. } catch (IOException e) {
  52. logger.warn("{} 登陆失败",ERPContants.USERNAME);
  53. }
  54. return null;
  55. }
  56. /**
  57. * 2 加班流程
  58. */
  59. public void loadOvertimeProcess(){
  60. Map<String, String> cookies = login();
  61. if(cookies == null){
  62. return ;
  63. }
  64. String processNo = "";//流程编号
  65. try {
  66. String html = con(ERPContants.OVERTIME_PROCESS_URL).cookie(cookies).getHtml();
  67. String token = RegexUtil.getFirstString(html, "Token=(.*?)&", 1);
  68. String workId = RegexUtil.getFirstString(html, "&WorkID=(\\d+?)&", 1);
  69. String StepID = "initialStep";
  70. String WorkActionID = RegexUtil.getFirstString(html, "&WorkActionID=(\\d+?)&", 1);
  71. logger.info("进入加班流程页面 token:{},workID:{},WorkActionID:{}", token,workId, WorkActionID);
  72. processNo = RegexUtil.getFirstString(html, "提出的加班申请\\[(\\d+?)\\]", 1);
  73. /**
  74. * 添加加班详细信息
  75. */
  76. goAddDetail(cookies, token, workId);
  77. /**
  78. * 添加附加
  79. */
  80. goAddAttachment(cookies, token, workId);
  81. // if(1==1) return;
  82. // String fromHtml = RegexUtil.getFirstString(html, "<form .*?>(.*?)</form>", 1);//提取form
  83. // List<String> nameList = RegexUtil.getList(fromHtml, "(<.*? name=.*?>)", 1);//提取所有的带有name的标签
  84. //提取结果为 需要提交的参数
  85. Map<String, String> data = new HashMap<>();
  86. data.put("$C{LCBT}", "0");//流程标题
  87. data.put("$C{JBXXXX}", "12");
  88. data.put("$C{JBNR}", "7");//加班内容
  89. data.put("JBNR", "具体的加班内容");// 具体的加班内容 手动填写
  90. data.put("$C{FJ}", "12");//附件
  91. data.put("$C{SPYJ}", "0");//审批意见
  92. //data.put("SPYJ", "");//审批意见
  93. data.put("$C{CSR}", "25");//抄送信息
  94. //data.put("CSR", "");
  95. data.put("$C{SQR}", "0");//申请人
  96. data.put("SQR", "60");//申请人 需/可 抓取
  97. data.put("$C{SQRBM}", "0");//申请人部门
  98. data.put("SQRBM", "142");//申请人部门 需/可 抓取
  99. data.put("$C{SQSJ}", "0");//申请时间
  100. data.put("SQSJ", "2015-11-18 15:09:44");//申请时间
  101. // 保存时通用参数 Token WorkID StepID
  102. data.put("WorkID", workId);
  103. data.put("StepID", StepID);
  104. data.put("Token", token);
  105. data.put("WorkActionID", WorkActionID);
  106. //抓取相关参数
  107. String SQRBM = RegexUtil.getFirstString(html, "name=\"SQRBM\".*?value=\"(\\d*?)\".*?>", 1);//申请人部门
  108. data.put("SQRBM", SQRBM);
  109. String SQR = RegexUtil.getFirstString(html, "name=\"SQR\".*?value=\"(\\d*?)\".*?>", 1);//申请人;
  110. data.put("SQR", SQR);
  111. //手动添加加班内容
  112. data.put("JBNR", "这是程序添加的测试加班内容" + UUID.randomUUID() + " 时间:" + DateFormatUtils.format(new Date(), "yyyy-MM-dd HH:mm"));
  113. // 提交加班流程
  114. String resultHtml = con(ERPContants.SUBMIT_OVERTIME_PROCESS_URL).cookie(cookies).data(data).postHtml();
  115. if(resultHtml.contains("执行提交成功")){
  116. logger.info("本次成功提交的流程编号为:{}" , RegexUtil.getFirstString(resultHtml, "提出的加班申请\\[(\\d*?)\\]", 1));
  117. System.out.println("processNo:" + processNo);
  118. }
  119. } catch (IOException e) {
  120. logger.warn("OPEN加班流程{}失败" , ERPContants.OVERTIME_PROCESS_URL);
  121. }
  122. }
  123. /**
  124. * 3 前往新增加班流程明细 并提交
  125. * 当前token workID 与 “2 加班流程”中的一致 尚未改变
  126. */
  127. public void goAddDetail(Map<String, String> cookies, String token, String workID){
  128. try {
  129. String detailHtml = con(ERPContants.ADD_OVERTIME_PROCESS_DETAIL_URL).cookie(cookies).data("Token", token).data("WorkID", workID).getHtml();
  130. String operateID = RegexUtil.getFirstString(detailHtml, ";OperateID=(.*?)&", 1);
  131. logger.info("前往新增加班流程明细 token:{},workID:{},operateID:{}", token,workID, operateID);
  132. Map<String, String> params = new HashMap<>();
  133. params.put("OperateID", operateID);
  134. // params.put("Token", token);
  135. params.put("WorkID", workID);
  136. params.put("XM", "60");//姓名
  137. params.put("$C{XM}", "0");
  138. params.put("BM", "盐城解放南路证券营业部");//部门
  139. params.put("KSSJ", DateFormatUtils.format(DateUtils.addHours(new Date(), 6) ,ERPContants.TIME_PATTERN));//加班开始时间
  140. params.put("JSSJ", DateFormatUtils.format(DateUtils.addHours(new Date(), 8) ,ERPContants.TIME_PATTERN));//加班结束时间
  141. params.put("JBSJ", "2.0");//加班时间(小时)
  142. params.put("TXHBZ", "1");//调休或补助 0 -1
  143. try{
  144. String html = con(ERPContants.SUBMIT_OVERTIME_PROCESS_DETAIL_URL).cookie(cookies).data(params).postHtml();
  145. if(html.contains("执行新增成功")){
  146. logger.info("成功新增加班流程明细");
  147. }
  148. }catch(Exception e){
  149. logger.warn("提交加班明细{}失败" , ERPContants.SUBMIT_OVERTIME_PROCESS_DETAIL_URL);
  150. }
  151. } catch (IOException e) {
  152. e.printStackTrace();
  153. logger.warn("前往加班明细{}失败" , ERPContants.ADD_OVERTIME_PROCESS_DETAIL_URL);
  154. }
  155. }
  156. /**
  157. * 4 前往添加附件
  158. * 当前token workID 与传入的token workId尚且一致
  159. * operateID 和步骤3中的 operateID 不一致
  160. */
  161. public void goAddAttachment(Map<String, String> cookies, String token, String workID){
  162. InputStream in = null;
  163. try {
  164. // String attachmentHtml = con(ERPContants.ADD_ATTACHMENT_URL).cookie(cookies).data("Token", token).data("WorkID", workID).getHtml();
  165. Response attachmentResponse = con(ERPContants.ADD_ATTACHMENT_URL).cookie(cookies).data("Token", token).data("WorkID", workID).execute();
  166. String refererUrl = attachmentResponse.url().toString();
  167. String attachmentHtml = attachmentResponse.parse().html();
  168. String operateID = RegexUtil.getFirstString(attachmentHtml, ";OperateID=(.*?)&", 1);
  169. logger.info("前往添加附件 token:{},workID:{},operateID:{}", token,workID, operateID);
  170. Map<String, String> params = new HashMap<>();//参数 文件参数为 FJ
  171. params.put("OperateID", operateID);
  172. params.put("Token", token);
  173. params.put("WorkID", workID);
  174. params.put("operate", "Add");
  175. /*params.put("$C{FJMC}", "1");
  176. params.put("$C{FJ}", "9");
  177. params.put("$C{BZ}", "7");*/
  178. params.put("BZ", "这是一个附件备注" +UUID.randomUUID());
  179. File file = new File("E:\\pic\\4.jpg");
  180. if(!file.exists()){
  181. logger.warn("不存在的附件");
  182. return ;
  183. }
  184. params.put("FJMC", file.getName());//附件名称
  185. in = new FileInputStream(file);
  186. Response response = con(ERPContants.SUBMIT_ATTACHMENT_URL).header("Content-Type", "multipart/form-data").
  187. cookie(cookies).data(params).data("FJ", file.getName(), in).setMethod(Method.POST).execute();
  188. String result = response.parse().html();
  189. System.out.println("ADD RESULT");
  190. System.out.println(result);
  191. // if(response.body().contains("执行新增成功")){
  192. // logger.info("提交附件成功");
  193. // logger.info("返回提交附件的结果为\n{{}}" ,RegexUtil.getFirstString(result, "var params =\\{(.*?)\\};", 1));
  194. // }
  195. /*
  196. * var params ={"message":"执行新增成功.","data":{ "results": 2, "records": [{"id":"-1001","FJMC":"123请问","FJ":"3.jpg","BZ":""},{"id":"-1002","FJMC":"额外发","FJ":"3.jpg","BZ":"阿
  197. 萨德"}]},"retVal":true,"success":true};
  198. * */
  199. } catch (IOException e) {
  200. e.printStackTrace();
  201. logger.info("添加附件失败");
  202. }finally {
  203. try {
  204. in.close();
  205. } catch (IOException e) {
  206. // TODO Auto-generated catch block
  207. e.printStackTrace();
  208. }
  209. }
  210. }
  211. /********************************************************************/
  212. public static void main(String[] args) {
  213. ErpCrawl e = new ErpCrawl();
  214. // e.testupload();
  215. e.loadOvertimeProcess();
  216. // testUrl();
  217. }
  218. /********************************************************************/
  219. public void testupload(){
  220. File file = new File("E:\\pic\\4.jpg");
  221. if(!file.exists()){
  222. logger.warn("不存在的附件");
  223. return ;
  224. }
  225. try{
  226. Response response = con("http://121.41.76.50/upfile/ajax/upfile").data("upfile", file.getName(), new FileInputStream(file)).setMethod(Method.POST).execute();
  227. System.out.println(response.body());
  228. System.out.println(response.statusCode());
  229. }catch(IOException e){
  230. e.printStackTrace();
  231. }
  232. }
  233. public static void testUrl(){
  234. String url="http://oa.gyzq.com.cn:8082/OperateProcessor?operate=WorkSave&Table=wfGYMS_JBSQ&Token=c98007f224b69d598b13f30e27b217ff&WorkID=2577&StepID=4&&isSubmit=1&%24C%7BCSR%7D=25&%24C%7BFJ%7D=12&%24C%7BJBNR%7D=7&%24C%7BJBXXXX%7D=12&%24C%7BLCBT%7D=0&%24C%7BSPYJ%7D=0&%24C%7BSQRBM%7D=0&%24C%7BSQR%7D=0&%24C%7BSQSJ%7D=0&CSR=&JBNR=0002&SPYJ=&SQR=60&SQRBM=142&SQSJ=2015-11-18%2013%3A19%3A29&operate=Update";
  235. List<String[]> list = RegexUtil.getList(url, "&(.*?)=(.*?)&", new int[]{1,2});
  236. for(String us[] : list) {
  237. System.out.println(us[0] + " || " + URLDecoder.decode(us[0]) + " | " +us[1]);
  238. }
  239. System.out.println("00000000");
  240. //解析提交流程的URL $C{JBXXXX}==加班详细信息 $C{FJ}==附件 $C{SPYJ}==审批意见=0 $C{SQR}==申请人=0 SQRBM==申请人部门=142 WorkID
  241. String url2 = "http://oa.gyzq.com.cn:8082/OperateProcessor?operate=WorkSave&Table=wfGYMS_JBSQ&Token=1cc072aef0e222b263d9c0e470dd8b1e&WorkID=40&StepID=initialStep&&isSubmit=1&%24C%7BCSR%7D=25&%24C%7BFJ%7D=12&%24C%7BJBNR%7D=7&%24C%7BJBXXXX%7D=12&%24C%7BLCBT%7D=0&%24C%7BSPYJ%7D=0&%24C%7BSQRBM%7D=0&%24C%7BSQR%7D=0&%24C%7BSQSJ%7D=0&CSR=&JBNR=%E6%B5%8B%E8%AF%95%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%822015%E5%B9%B411%E6%9C%8818%E6%97%A514%3A53%3A48&SPYJ=&SQR=60&SQRBM=142&SQSJ=2015-11-18%2014%3A50%3A51&operate=Add";
  242. List<String[]> list2 = RegexUtil.getList(url2, "&(.*?)=(.*?)&", new int[]{1,2});
  243. for(String us[] : list2) {
  244. System.out.println(us[0] + " || " + URLDecoder.decode(us[0]) + " | " +us[1]);
  245. }
  246. String url5 = "&%24C%7BCSR%7D=25&%24C%7BFJ%7D=12&%24C%7BJBNR%7D=7&%24C%7BJBXXXX%7D=12&%24C%7BLCBT%7D=0&%24C%7BSPYJ%7D=0&%24C%7BSQRBM%7D=0&%24C%7BSQR%7D=0&%24C%7BSQSJ%7D=0&CSR=&JBNR=%E9%98%BF%E8%90%A8%E5%BE%B7%E9%98%BF%E8%90%A8%E5%BE%B7%E6%B5%8B%E8%AF%95%20%E6%B5%8B%E8%AF%95%20%E6%B5%8B%E8%AF%95&SPYJ=&SQR=60&SQRBM=142&SQSJ=2015-11-19%2011%3A07%3A59&";
  247. List<String[]> list5 = RegexUtil.getList(url5, "&(.*?)=(.*?)&", new int[]{1,2});
  248. for(String us[] : list5) {
  249. System.out.println(us[0] + " || " + URLDecoder.decode(us[0]) + " | " +us[1]);
  250. }
  251. }
  252. }

ErpContants

  1. package com.vic.study.crawl.util;
  2. public class ERPContants {
  3. public static final String TIME_PATTERN = "yyyy-MM-dd HH:mm";
  4. public static final String USERNAME = "0254";
  5. public static final String PASSWORD = "password123";
  6. public static final String BASE_URL = "http://oa.gyzq.com.cn:8082";// 网址
  7. public static final String LOGIN_URL = "http://oa.gyzq.com.cn:8082/login.do";// 登陆
  8. public static final String HOME_PAGE_URL = "http://oa.gyzq.com.cn:8082/welcome.do";// 主页
  9. //加班流程
  10. public static final String OVERTIME_PROCESS_URL = "http://oa.gyzq.com.cn:8082/StartWorkflow?Workflow=wfGYMS_JBSQ&extWindow=true&PopupWin=true";
  11. //保存加班流程 Token WorkID StepID
  12. public static final String SAVE_OVERTIME_PROCESS_URL = "http://oa.gyzq.com.cn:8082/OperateProcessor?operate=WorkSave&Table=wfGYMS_JBSQ&isSubmit=1&operate=Update";
  13. //提交加班流程 WorkActionID Token WorkID
  14. public static final String SUBMIT_OVERTIME_PROCESS_URL = "http://oa.gyzq.com.cn:8082/OperateProcessor?operate=WorkAction.5&Table=wfGYMS_JBSQ&StepID=initialStep&&isSubmit=1&operate=Add";
  15. // Token WorkID 前往新增详细
  16. public static final String ADD_OVERTIME_PROCESS_DETAIL_URL = "http://oa.gyzq.com.cn:8082/WorkProcessor?Table=wfGYMS_JBSQ&StepID=initialStep&Column=JBXXXX&SubmitForm=DATA_FORM&EVENT_SOURCE=AddHypotObj&extWindow=true&PopupWin=true";
  17. // 提交详细 Token WorkID OperateID
  18. public static final String SUBMIT_OVERTIME_PROCESS_DETAIL_URL = "http://oa.gyzq.com.cn:8082/OperateProcessor?Table=wfGYMS_JBSQ&StepID=initialStep&extWindow=true&extWindow=true&PopupWin=true&NewWindow=false";
  19. //打开提交附件 Token WorkID
  20. public static final String ADD_ATTACHMENT_URL = "http://oa.gyzq.com.cn:8082/WorkProcessor?Table=wfGYMS_JBSQ&StepID=initialStep&Column=FJ&SubmitForm=DATA_FORM&EVENT_SOURCE=AddHypotObj&extWindow=true&PopupWin=true";
  21. //提交附件 OperateID WorkID Token
  22. public static final String SUBMIT_ATTACHMENT_URL = "http://oa.gyzq.com.cn:8082/OperateProcessor?Table=wfGYMS_JBSQ&StepID=7&extWindow=true&PopupWin=true&NewWindow=false";
  23. // /OperateProcessor?Table=wfGYMS_JBSQ&Token=c04a7c4eca425b738e8686487351af6f&WorkID=202&StepID=initialStep&OperateID=34b53c980468fe3fc10140f46acf28e9&extWindow=true
  24. }
添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注