@xuduochoua
2016-05-23T07:08:52.000000Z
字数 27419
阅读 1844
爬虫 demo
maven项目利用Jsoup完成对网页内容的抓取,包括文章列表、文章详情,以及列表项详情中的图片,并写入本地文件
pom文件如下
<!-- Maven build for the crawler demo.
     FIX: the original had no whitespace between the xmlns:xsi declaration and
     xsi:schemaLocation, which is a well-formedness error (attributes must be
     separated by whitespace); an XML parser rejects the file. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>craw</groupId>
  <artifactId>craw</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <dependencies>
    <dependency>
      <groupId>commons-httpclient</groupId>
      <artifactId>commons-httpclient</artifactId>
      <version>3.1</version>
    </dependency>
    <dependency>
      <groupId>org.codehaus.jackson</groupId>
      <artifactId>jackson-mapper-asl</artifactId>
      <version>1.9.12</version>
    </dependency>
    <!-- Basic general-purpose helpers: toString()/hashCode()/equals() builders,
         array handling, enums, date/time utilities, etc. -->
    <dependency>
      <groupId>commons-lang</groupId>
      <artifactId>commons-lang</artifactId>
      <version>2.6</version>
    </dependency>
    <!-- Common encoders/decoders: DES, SHA1, MD5, Base64, URL, Soundex, etc.
         Supports decoding as well as encoding. -->
    <dependency>
      <groupId>commons-codec</groupId>
      <artifactId>commons-codec</artifactId>
      <version>1.10</version>
    </dependency>
    <!-- Unified logging facade over "all Java logging implementations";
         its own built-in logging is deliberately minimal. -->
    <dependency>
      <groupId>commons-logging</groupId>
      <artifactId>commons-logging</artifactId>
      <version>1.2</version>
    </dependency>
    <dependency>
      <groupId>commons-io</groupId>
      <artifactId>commons-io</artifactId>
      <version>2.4</version>
    </dependency>
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.8.3</version>
    </dependency>
    <!-- slf4j + logback for logging: start -->
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
      <version>1.7.8</version>
    </dependency>
    <dependency>
      <groupId>ch.qos.logback</groupId>
      <artifactId>logback-classic</artifactId>
      <version>1.1.3</version>
    </dependency>
    <dependency>
      <groupId>ch.qos.logback</groupId>
      <artifactId>logback-core</artifactId>
      <version>1.1.3</version>
    </dependency>
    <!-- slf4j + logback for logging: end -->
  </dependencies>
  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <configuration>
          <source>1.7</source>
          <target>1.7</target>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
<?xml version="1.0" encoding="UTF-8"?>
<!-- logback configuration.
     scan: when true, the config file is reloaded if it changes (default true).
     scanPeriod: interval for change detection (default unit ms; only effective
     when scan="true"; default 1 minute).
     debug: when true, logback prints its internal status messages (default false). -->
<configuration scan="false" scanPeriod="60 seconds" debug="false">
  <!-- Root directory for log files -->
  <property name="LOG_HOME" value="/app/log" />
  <!-- Base name of the log file -->
  <property name="appName" value="netty"></property>
  <!-- Console output appender -->
  <appender name="stdout" class="ch.qos.logback.core.ConsoleAppender">
    <!-- NOTE(review): <Encoding> is not a recognized ConsoleAppender property in
         logback 1.1.x (an <encoder> with <charset> is the documented mechanism);
         kept as-is to preserve the original configuration — confirm against the
         logback status output. -->
    <Encoding>UTF-8</Encoding>
    <!-- Pattern: %d date/time, %thread thread name, %-5level level padded to 5
         chars, %logger{50} logger name capped at 50 chars (abbreviated at dots),
         %msg message, %n newline -->
    <layout class="ch.qos.logback.classic.PatternLayout">
      <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{50} - %msg%n</pattern>
    </layout>
  </appender>
  <!-- Rolling file appender: writes to the active file, then rolls it over to
       archive files when the policy triggers -->
  <appender name="appLogAppender" class="ch.qos.logback.core.rolling.RollingFileAppender">
    <Encoding>UTF-8</Encoding>
    <!-- Active log file -->
    <file>${LOG_HOME}/${appName}.log</file>
    <!-- TimeBasedRollingPolicy: the most common policy; rolls by time and also
         triggers the rollover itself -->
    <rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
      <!-- Archive file location/name: %d{yyyy-MM-dd} rolls daily; %i increments
           when the file exceeds maxFileSize within the same day -->
      <fileNamePattern>${LOG_HOME}/${appName}-%d{yyyy-MM-dd}-%i.log</fileNamePattern>
      <!-- Optional: cap on retained archives; with daily rollover and 365,
           only the last 365 days are kept and older files (and any directories
           created for archiving) are deleted -->
      <MaxHistory>365</MaxHistory>
      <!-- Size-based rollover within a time window: SizeBasedTriggeringPolicy
           alone cannot roll by size here; SizeAndTimeBasedFNATP must be used -->
      <timeBasedFileNamingAndTriggeringPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedFNATP">
        <maxFileSize>100MB</maxFileSize>
      </timeBasedFileNamingAndTriggeringPolicy>
    </rollingPolicy>
    <!-- Pattern: %d date/time, %thread thread name, %-5level padded level,
         %logger{50} abbreviated logger name, %line source line, %msg message,
         %n newline -->
    <layout class="ch.qos.logback.classic.PatternLayout">
      <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [ %thread ] - [ %-5level ] [ %logger{50} : %line ] - %msg%n</pattern>
    </layout>
  </appender>
  <!-- logger: name matches a package prefix; level is one of
       TRACE < DEBUG < INFO < WARN < ERROR; additivity controls whether the
       root logger's appenders are also used (true) or only this logger's
       appender-refs (false) -->
  <!-- hibernate logger -->
  <logger name="org.hibernate" level="error" />
  <!-- Spring framework logger -->
  <logger name="org.springframework" level="error" additivity="false"></logger>
  <logger name="com.vic.study" level="info" additivity="true">
    <appender-ref ref="appLogAppender" />
  </logger>
  <!-- root is the parent of every logger; any class resolves to exactly one
       logger (a named one or root), whose level and appenders then apply -->
  <root level="info">
    <appender-ref ref="stdout" />
    <appender-ref ref="appLogAppender" />
  </root>
</configuration>
最重要的为正则工具类
HTTP请求相关
import com.vic.study.crawl.util.IOUtils;/*** HTTP请求工具类* @author VIC**/public class CrawlConnect {private final Logger logger = LoggerFactory.getLogger(CrawlConnect.class);private Connection connection;/*** 必定先调用这个方法* @param url* @return*/public CrawlConnect(Connection connection){this.connection = connection;}public CrawlConnect url(String url){connection.url(url);return this;}public CrawlConnect url(URL url){connection.url(url);return this;}public CrawlConnect cookie(String name, String value){connection.cookie(name, value);return this;}public CrawlConnect cookie(Map<String, String> cookies){connection.cookies(cookies);return this;}public CrawlConnect data(String... keyvals){connection.data(keyvals);return this;}public CrawlConnect data(String key, String value ) {connection.data(key, value);return this;}public CrawlConnect data(String key, String filename, InputStream in){connection.data(key, filename, in);return this;}public Connection.Response execute() throws IOException{return connection.execute();}public CrawlConnect followRedirects(boolean followRedirects){connection.followRedirects(followRedirects);return this;}//getpublic Document getDocument() throws IOException {return connection.get();}public String getHtml() throws IOException{return this.getDocument().html();}public String getBodyText() throws IOException{return this.getDocument().body().text();}public CrawlConnect header(String key, String value) {connection.header(key, value);return this;}public CrawlConnect maxBodySize(int bytes) {connection.maxBodySize(bytes);return this;}public CrawlConnect method(Connection.Method method){connection.method(method);return this;}//Provide an alternate parser to use when parsing the response to a Document.public CrawlConnect parser(Parser parser){connection.parser(parser);return this;}//postpublic Document postDocument() throws IOException{return connection.post();}public String postHtml() throws IOException{return this.postDocument().html();}public String 
postBodyText() throws IOException{return this.postDocument().body().text();}//Sets the default post data character set for x-www-form-urlencoded post datapublic CrawlConnect postDataCharset(String charset) {connection.postDataCharset(charset);return this;}public Connection.Request request(){return connection.request();}public Connection.Response response(){return connection.response();}//Disable/enable TSL certificates validation for HTTPS requests.public CrawlConnect validateTLSCertificates(boolean value) {connection.validateTLSCertificates(value);return this;}/*** 下载文件到本地* @param path* @param fileName* @throws IOException*/public void downFile(String path, String fileName) throws IOException{Response response = this.execute();File file = IOUtils.getFileByPathAndName(path, fileName);FileOutputStream out = new FileOutputStream(file);out.write(response.bodyAsBytes());out.close();}}
爬虫类继承此抽象类,方便连接、操作网页
import org.jsoup.Connection;import org.jsoup.Jsoup;import com.vic.study.crawl.util.PropertiesUtil;/*** 爬虫基类* @author VIC**/public abstract class BaseCrawl {/*** 连接网页* @param url* @return*/protected CrawlConnect con(String url){Connection conn = Jsoup.connect(url).ignoreContentType(true).timeout(PropertiesUtil.getIntByKey("timeout"));return new CrawlConnect(conn);}}
根据分类抓取博客园下文章列表及详情及其他 写入本地文件
import java.io.File;import java.io.IOException;import java.util.Date;import java.util.List;import java.util.UUID;import org.apache.commons.io.FileUtils;import org.apache.commons.lang.StringUtils;import org.slf4j.Logger;import org.slf4j.LoggerFactory;import com.vic.study.crawl.util.IOUtils;import com.vic.study.crawl.util.RegexUtil;/*** 抓取博客园的列表以及详情* @author VIC**/public class CrawlCnblogList extends BaseCrawl{private Logger logger = LoggerFactory.getLogger(CrawlCnblogList.class);private static final String SAVE_LOCATION = "E:/crawl/";//下载的保存位置private static final String baseUrl = "http://www.cnblogs.com/cate/";private static final String listRegex = "<div id=\"post_list\">(.*?)</div>\\s*<script>"; //列表页private static final String itemRegex = "<div class=\"post_item\">(.*?)<div class=\"clear\"></div>\\s*</div>"; //提取列表页的每个itemprivate static final String titleRegex = "<h3><a class=\"titlelnk\" href=\".*?\" target=\"_blank\">(.*?)</a></h3>";//标题private static final String linkRegex = "<h3><a class=\"titlelnk\" href=\"(.*?)\" target=\"_blank\">.*?</a></h3>";//链接private static final String descRegex = "<p class=\"post_item_summary\"> (.*?)</p>";//简介private static final String authorRegex = "<a href=\".*?\" class=\"lightblue\">(.*?)</a>";//作者private static final String timeRegex = "发布于 (.*?)<span class=\"article_comment\">";//发布时间private static final String detailRegex = "<div id=\"cnblogs_post_body\">(.*?)</div>\\s*<div id=\"MySignature\"></div>";//文章详情内容private static final String imgRegex = "<img .*? 
src=\"(.*?)\" .*?>";//详情页图片地址/*** 根据分类抓取博客园下文章列表及详情及其他 写入本地文件* @param categorys 如java/,php/* @throws IOException*/public void list(String...categories) {if(categories.length < 1) {logger.info("请输入至少一个分类");return;}for(String category : categories){String url = baseUrl + category +"/";try {String basePath = SAVE_LOCATION + category +"/" + new Date().getTime();//获取文章列表String html = con(url).getHtml();String itemsStr = RegexUtil.getFirstString(html, listRegex, 1);List<String> list = RegexUtil.getList(itemsStr, itemRegex, 1);for(String item : list) {String title = RegexUtil.getFirstString(item, titleRegex,1);String link = RegexUtil.getFirstString(item, linkRegex,1);String desc = RegexUtil.getFirstString(item, descRegex,1);String author = RegexUtil.getFirstString(item, authorRegex,1);String time = RegexUtil.getFirstString(item, timeRegex,1);StringBuffer baseContent = new StringBuffer();baseContent.append("标题:").append(title).append("\r\n");baseContent.append("链接:").append(link).append("\r\n");baseContent.append("作者:").append(author).append("\r\n");baseContent.append("时间:").append(time).append("\r\n");baseContent.append("简介:").append(desc).append("\r\n");String titleDir = StringUtils.isEmpty(title)? ("title" +UUID.randomUUID()) : title;String baseDir = basePath + File.separator + titleDir ;//每一篇文章存的目录String fileName = (StringUtils.isEmpty(title)? 
("title" +UUID.randomUUID()) : title ) + ".txt";File baseFile = IOUtils.getFileByPathAndName(baseDir, fileName);FileUtils.write(baseFile, baseContent.toString(), "utf-8");//获取详情try{String detailPath = baseDir + "/detail/";String detail = con(link).getHtml();String content = RegexUtil.getFirstString(detail, detailRegex, 1);FileUtils.writeStringToFile(IOUtils.getFileByPathAndName(detailPath, fileName), content, "utf-8");List<String> imgs = RegexUtil.getList(content, imgRegex, 1);if(imgs.size() > 0) {for(String img : imgs){String imgName = img.substring(img.lastIndexOf("/")+ 1, img.length());try{con(img).downFile(detailPath, imgName);logger.warn("图片{}下载成功{}",imgName, img);}catch(Exception e) {logger.warn("图片{}下载失败",imgName);}}}}catch(Exception e){logger.warn("获得{}详情失败:{}", title, link);}}} catch (IOException e) {logger.warn("获得{}列表失败:{}", categories, url);e.printStackTrace();}}}public static void main(String[] args) {CrawlCnblogList l = new CrawlCnblogList();l.list("php", "java");}}
这是一个垂直抓取的网络爬虫 Demo(基于 HtmlUnit)。
package com.guoyuan.scene.database.test;

import java.net.URL;
import java.util.List;
import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.html.HtmlButton;
import com.gargoylesoftware.htmlunit.html.HtmlDivision;
import com.gargoylesoftware.htmlunit.html.HtmlImageInput;
import com.gargoylesoftware.htmlunit.html.HtmlInput;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * HtmlUnit demo: logs into an OA system by filling the login form and
 * clicking its image button, then opens a grid page and clicks through its
 * rows, printing the resulting markup.
 *
 * NOTE(review): user id and password are hard-coded in this source file —
 * they should be externalized before any real use.
 */
public class HttpTest {
    public static void main(String[] args) throws Exception {
        String url = "http://oa.gyzq.com.cn:8082/welcome.do";// target URL to fetch
        String refer = "http://outofmemory.cn/";
        URL link = new URL(url);
        WebClient wc = new WebClient();
        WebRequest request = new WebRequest(link);
        request.setCharset("UTF-8");
        request.setAdditionalHeader("Referer", refer);// set the Referer request header
        // set the User-Agent request header
        request.setAdditionalHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 5.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2");
        // wc.addRequestHeader("User-Agent",
        // "Mozilla/5.0 (Windows NT 5.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2");
        // wc.addRequestHeader and request.setAdditionalHeader should be equivalent; use either.
        // Other header fields can be added as needed.
        wc.getCookieManager().setCookiesEnabled(true);// enable cookie management
        wc.getOptions().setJavaScriptEnabled(true);// enable JS — required for script-heavy pages
        wc.getOptions().setCssEnabled(true);// enable CSS parsing — required for script-heavy pages
        wc.getOptions().setThrowExceptionOnFailingStatusCode(false);
        wc.getOptions().setThrowExceptionOnScriptError(false);
        wc.getOptions().setTimeout(10000);
        // Pre-set cookies could be added here if available:
        // Set<Cookie> cookies = null;
        // Iterator<Cookie> i = cookies.iterator();
        // while (i.hasNext())
        // {
        // wc.getCookieManager().addCookie(i.next());
        // }
        // Setup done — fetch the login page.
        HtmlPage page = null;
        page = wc.getPage(request);
        if (page == null) {
            System.out.println("采集 " + url + " 失败!!!");
            return;
        }
        String content = page.asXml();// page markup held in content
        if (content == null) {
            System.out.println("采集 " + url + " 失败!!!");
            return;
        }
        // Locate the login form's image submit button and credential inputs.
        List<?> inputList = page.getByXPath("//input[@name='image']");
        HtmlImageInput image = (HtmlImageInput)inputList.get(0);
        HtmlInput userid = (HtmlInput)page.getHtmlElementById("userId");
        HtmlInput pass = (HtmlInput)page.getHtmlElementById("pass");
        userid.setValueAttribute("0254");
        pass.setValueAttribute("password123");
        image.click();// submit the login form
        // After login, request the grid page directly.
        request = new WebRequest(new URL("http://oa.gyzq.com.cn:8082/UIProcessor?Table=vGYMS_JBSQ"));
        request.setCharset("UTF-8");
        request.setAdditionalHeader("Referer", refer);// set the Referer request header
        // set the User-Agent request header
        request.setAdditionalHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 5.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2");
        HtmlPage page2 = wc.getPage(request);
        List<?> buttonList = page2.getElementsByTagName("button");
        List<?> divList = page2.getByXPath("//div[contains(@class,'x-grid3-row')]");
        for (int i = 0; i < buttonList.size(); i++) {
            HtmlButton button = (HtmlButton)buttonList.get(i);
            System.out.println(button.getTextContent());
        }
        // NOTE(review): "ext-gen39" is an ExtJS auto-generated id — presumably
        // fragile across sessions; confirm it is stable on this page.
        HtmlButton button = (HtmlButton)page2.getElementById("ext-gen39");
        for (int i = 0; i < divList.size(); i++) {
            HtmlDivision div = (HtmlDivision)divList.get(i);
            div.click();// select the grid row
            if (button != null) {
                Page popup = button.click();// open the row's popup, if any
                if (popup.isHtmlPage()) {
                    System.out.println(((HtmlPage) popup).asXml());
                }
            }
            System.out.println(div.asXml());
        }
        // Earlier experiment kept for reference: navigate frames after login.
        // Page page2 = image.click();
        // if (page2.isHtmlPage()) {
        // HtmlPage htmlPage = (HtmlPage) page2;
        // List<FrameWindow> iframeList = htmlPage.getFrames();
        // for (FrameWindow frame : iframeList) {
        // if (frame.getName().equals("fraRightFrame")) {
        // final HtmlPage pageTwo = (HtmlPage) frame.getEnclosedPage();
        // HtmlAnchor anchor = pageTwo.getAnchorByText("陈洋于2015-08-24发起的发文申请流程");
        // HtmlPage page3 = anchor.click();
        // System.out.println(page3.asXml());
        // System.out.println(page3.asXml());
        // }
        // }
        // }
        // // Done.
        // CookieManager CM = wc.getCookieManager(); // WC = Your WebClient's name
        // Set<Cookie> cookies_ret = CM.getCookies();// returned cookies, reusable on the next request
        // System.out.println(123);
    }
}
package com.vic.study.crawl.erp;import java.io.File;import java.io.FileInputStream;import java.io.IOException;import java.io.InputStream;import java.net.URLDecoder;import java.util.Date;import java.util.HashMap;import java.util.List;import java.util.Map;import java.util.UUID;import org.apache.commons.lang.time.DateFormatUtils;import org.apache.commons.lang.time.DateUtils;import org.jsoup.Connection.Method;import org.jsoup.Connection.Response;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import com.vic.study.crawl.BaseCrawl;import com.vic.study.crawl.util.ERPContants;import com.vic.study.crawl.util.RegexUtil;/*** 抓取ERP 自动提交加班流程* @author VIC**/public class ErpCrawl extends BaseCrawl{public static void printMap(Map<String, String> map){System.out.println("打印map start--------------->>");for(Map.Entry<String, String> entry : map.entrySet()) {System.out.println(entry.getKey() + "—" + entry.getValue());}System.out.println("打印map end<<---------------------");}/*** 登陆* @return*/public Map<String, String> login(){try {Map<String, String> cookies = con(ERPContants.LOGIN_URL).data("userId", ERPContants.USERNAME).data("pass", ERPContants.PASSWORD).execute().cookies();if(cookies != null) {Document docemnet = con(ERPContants.HOME_PAGE_URL).cookie(cookies).getDocument();Element userNameEle=docemnet.getElementById("userNameCt");if(userNameEle!=null){logger.warn("{} 登陆成功" , ERPContants.USERNAME);return cookies;}else{logger.warn("{}登陆失败" , ERPContants.USERNAME);}}} catch (IOException e) {logger.warn("{} 登陆失败",ERPContants.USERNAME);}return null;}/*** 2 加班流程*/public void loadOvertimeProcess(){Map<String, String> cookies = login();if(cookies == null){return ;}String processNo = "";//流程编号try {String html = con(ERPContants.OVERTIME_PROCESS_URL).cookie(cookies).getHtml();String token = RegexUtil.getFirstString(html, "Token=(.*?)&", 1);String workId = RegexUtil.getFirstString(html, "&WorkID=(\\d+?)&", 1);String StepID = "initialStep";String WorkActionID = 
RegexUtil.getFirstString(html, "&WorkActionID=(\\d+?)&", 1);logger.info("进入加班流程页面 token:{},workID:{},WorkActionID:{}", token,workId, WorkActionID);processNo = RegexUtil.getFirstString(html, "提出的加班申请\\[(\\d+?)\\]", 1);/*** 添加加班详细信息*/goAddDetail(cookies, token, workId);/*** 添加附加*/goAddAttachment(cookies, token, workId);// if(1==1) return;// String fromHtml = RegexUtil.getFirstString(html, "<form .*?>(.*?)</form>", 1);//提取form// List<String> nameList = RegexUtil.getList(fromHtml, "(<.*? name=.*?>)", 1);//提取所有的带有name的标签//提取结果为 需要提交的参数Map<String, String> data = new HashMap<>();data.put("$C{LCBT}", "0");//流程标题data.put("$C{JBXXXX}", "12");data.put("$C{JBNR}", "7");//加班内容data.put("JBNR", "具体的加班内容");// 具体的加班内容 手动填写data.put("$C{FJ}", "12");//附件data.put("$C{SPYJ}", "0");//审批意见//data.put("SPYJ", "");//审批意见data.put("$C{CSR}", "25");//抄送信息//data.put("CSR", "");data.put("$C{SQR}", "0");//申请人data.put("SQR", "60");//申请人 需/可 抓取data.put("$C{SQRBM}", "0");//申请人部门data.put("SQRBM", "142");//申请人部门 需/可 抓取data.put("$C{SQSJ}", "0");//申请时间data.put("SQSJ", "2015-11-18 15:09:44");//申请时间// 保存时通用参数 Token WorkID StepIDdata.put("WorkID", workId);data.put("StepID", StepID);data.put("Token", token);data.put("WorkActionID", WorkActionID);//抓取相关参数String SQRBM = RegexUtil.getFirstString(html, "name=\"SQRBM\".*?value=\"(\\d*?)\".*?>", 1);//申请人部门data.put("SQRBM", SQRBM);String SQR = RegexUtil.getFirstString(html, "name=\"SQR\".*?value=\"(\\d*?)\".*?>", 1);//申请人;data.put("SQR", SQR);//手动添加加班内容data.put("JBNR", "这是程序添加的测试加班内容" + UUID.randomUUID() + " 时间:" + DateFormatUtils.format(new Date(), "yyyy-MM-dd HH:mm"));// 提交加班流程String resultHtml = con(ERPContants.SUBMIT_OVERTIME_PROCESS_URL).cookie(cookies).data(data).postHtml();if(resultHtml.contains("执行提交成功")){logger.info("本次成功提交的流程编号为:{}" , RegexUtil.getFirstString(resultHtml, "提出的加班申请\\[(\\d*?)\\]", 1));System.out.println("processNo:" + processNo);}} catch (IOException e) {logger.warn("OPEN加班流程{}失败" , ERPContants.OVERTIME_PROCESS_URL);}}/*** 3 前往新增加班流程明细 并提交* 
当前token workID 与 “2 加班流程”中的一致 尚未改变*/public void goAddDetail(Map<String, String> cookies, String token, String workID){try {String detailHtml = con(ERPContants.ADD_OVERTIME_PROCESS_DETAIL_URL).cookie(cookies).data("Token", token).data("WorkID", workID).getHtml();String operateID = RegexUtil.getFirstString(detailHtml, ";OperateID=(.*?)&", 1);logger.info("前往新增加班流程明细 token:{},workID:{},operateID:{}", token,workID, operateID);Map<String, String> params = new HashMap<>();params.put("OperateID", operateID);// params.put("Token", token);params.put("WorkID", workID);params.put("XM", "60");//姓名params.put("$C{XM}", "0");params.put("BM", "盐城解放南路证券营业部");//部门params.put("KSSJ", DateFormatUtils.format(DateUtils.addHours(new Date(), 6) ,ERPContants.TIME_PATTERN));//加班开始时间params.put("JSSJ", DateFormatUtils.format(DateUtils.addHours(new Date(), 8) ,ERPContants.TIME_PATTERN));//加班结束时间params.put("JBSJ", "2.0");//加班时间(小时)params.put("TXHBZ", "1");//调休或补助 0 -1try{String html = con(ERPContants.SUBMIT_OVERTIME_PROCESS_DETAIL_URL).cookie(cookies).data(params).postHtml();if(html.contains("执行新增成功")){logger.info("成功新增加班流程明细");}}catch(Exception e){logger.warn("提交加班明细{}失败" , ERPContants.SUBMIT_OVERTIME_PROCESS_DETAIL_URL);}} catch (IOException e) {e.printStackTrace();logger.warn("前往加班明细{}失败" , ERPContants.ADD_OVERTIME_PROCESS_DETAIL_URL);}}/*** 4 前往添加附件* 当前token workID 与传入的token workId尚且一致* operateID 和步骤3中的 operateID 不一致*/public void goAddAttachment(Map<String, String> cookies, String token, String workID){InputStream in = null;try {// String attachmentHtml = con(ERPContants.ADD_ATTACHMENT_URL).cookie(cookies).data("Token", token).data("WorkID", workID).getHtml();Response attachmentResponse = con(ERPContants.ADD_ATTACHMENT_URL).cookie(cookies).data("Token", token).data("WorkID", workID).execute();String refererUrl = attachmentResponse.url().toString();String attachmentHtml = attachmentResponse.parse().html();String operateID = RegexUtil.getFirstString(attachmentHtml, ";OperateID=(.*?)&", 
1);logger.info("前往添加附件 token:{},workID:{},operateID:{}", token,workID, operateID);Map<String, String> params = new HashMap<>();//参数 文件参数为 FJparams.put("OperateID", operateID);params.put("Token", token);params.put("WorkID", workID);params.put("operate", "Add");/*params.put("$C{FJMC}", "1");params.put("$C{FJ}", "9");params.put("$C{BZ}", "7");*/params.put("BZ", "这是一个附件备注" +UUID.randomUUID());File file = new File("E:\\pic\\4.jpg");if(!file.exists()){logger.warn("不存在的附件");return ;}params.put("FJMC", file.getName());//附件名称in = new FileInputStream(file);Response response = con(ERPContants.SUBMIT_ATTACHMENT_URL).header("Content-Type", "multipart/form-data").cookie(cookies).data(params).data("FJ", file.getName(), in).setMethod(Method.POST).execute();String result = response.parse().html();System.out.println("ADD RESULT");System.out.println(result);// if(response.body().contains("执行新增成功")){// logger.info("提交附件成功");// logger.info("返回提交附件的结果为\n{{}}" ,RegexUtil.getFirstString(result, "var params =\\{(.*?)\\};", 1));// }/** var params ={"message":"执行新增成功.","data":{ "results": 2, "records": [{"id":"-1001","FJMC":"123请问","FJ":"3.jpg","BZ":""},{"id":"-1002","FJMC":"额外发","FJ":"3.jpg","BZ":"阿萨德"}]},"retVal":true,"success":true};* */} catch (IOException e) {e.printStackTrace();logger.info("添加附件失败");}finally {try {in.close();} catch (IOException e) {// TODO Auto-generated catch blocke.printStackTrace();}}}/********************************************************************/public static void main(String[] args) {ErpCrawl e = new ErpCrawl();// e.testupload();e.loadOvertimeProcess();// testUrl();}/********************************************************************/public void testupload(){File file = new File("E:\\pic\\4.jpg");if(!file.exists()){logger.warn("不存在的附件");return ;}try{Response response = con("http://121.41.76.50/upfile/ajax/upfile").data("upfile", file.getName(), new 
FileInputStream(file)).setMethod(Method.POST).execute();System.out.println(response.body());System.out.println(response.statusCode());}catch(IOException e){e.printStackTrace();}}public static void testUrl(){String url="http://oa.gyzq.com.cn:8082/OperateProcessor?operate=WorkSave&Table=wfGYMS_JBSQ&Token=c98007f224b69d598b13f30e27b217ff&WorkID=2577&StepID=4&&isSubmit=1&%24C%7BCSR%7D=25&%24C%7BFJ%7D=12&%24C%7BJBNR%7D=7&%24C%7BJBXXXX%7D=12&%24C%7BLCBT%7D=0&%24C%7BSPYJ%7D=0&%24C%7BSQRBM%7D=0&%24C%7BSQR%7D=0&%24C%7BSQSJ%7D=0&CSR=&JBNR=0002&SPYJ=&SQR=60&SQRBM=142&SQSJ=2015-11-18%2013%3A19%3A29&operate=Update";List<String[]> list = RegexUtil.getList(url, "&(.*?)=(.*?)&", new int[]{1,2});for(String us[] : list) {System.out.println(us[0] + " || " + URLDecoder.decode(us[0]) + " | " +us[1]);}System.out.println("00000000");//解析提交流程的URL $C{JBXXXX}==加班详细信息 $C{FJ}==附件 $C{SPYJ}==审批意见=0 $C{SQR}==申请人=0 SQRBM==申请人部门=142 WorkIDString url2 = "http://oa.gyzq.com.cn:8082/OperateProcessor?operate=WorkSave&Table=wfGYMS_JBSQ&Token=1cc072aef0e222b263d9c0e470dd8b1e&WorkID=40&StepID=initialStep&&isSubmit=1&%24C%7BCSR%7D=25&%24C%7BFJ%7D=12&%24C%7BJBNR%7D=7&%24C%7BJBXXXX%7D=12&%24C%7BLCBT%7D=0&%24C%7BSPYJ%7D=0&%24C%7BSQRBM%7D=0&%24C%7BSQR%7D=0&%24C%7BSQSJ%7D=0&CSR=&JBNR=%E6%B5%8B%E8%AF%95%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%82%E3%80%822015%E5%B9%B411%E6%9C%8818%E6%97%A514%3A53%3A48&SPYJ=&SQR=60&SQRBM=142&SQSJ=2015-11-18%2014%3A50%3A51&operate=Add";List<String[]> list2 = RegexUtil.getList(url2, "&(.*?)=(.*?)&", new int[]{1,2});for(String us[] : list2) {System.out.println(us[0] + " || " + URLDecoder.decode(us[0]) + " | " +us[1]);}String url5 = 
"&%24C%7BCSR%7D=25&%24C%7BFJ%7D=12&%24C%7BJBNR%7D=7&%24C%7BJBXXXX%7D=12&%24C%7BLCBT%7D=0&%24C%7BSPYJ%7D=0&%24C%7BSQRBM%7D=0&%24C%7BSQR%7D=0&%24C%7BSQSJ%7D=0&CSR=&JBNR=%E9%98%BF%E8%90%A8%E5%BE%B7%E9%98%BF%E8%90%A8%E5%BE%B7%E6%B5%8B%E8%AF%95%20%E6%B5%8B%E8%AF%95%20%E6%B5%8B%E8%AF%95&SPYJ=&SQR=60&SQRBM=142&SQSJ=2015-11-19%2011%3A07%3A59&";List<String[]> list5 = RegexUtil.getList(url5, "&(.*?)=(.*?)&", new int[]{1,2});for(String us[] : list5) {System.out.println(us[0] + " || " + URLDecoder.decode(us[0]) + " | " +us[1]);}}}
package com.vic.study.crawl.util;public class ERPContants {public static final String TIME_PATTERN = "yyyy-MM-dd HH:mm";public static final String USERNAME = "0254";public static final String PASSWORD = "password123";public static final String BASE_URL = "http://oa.gyzq.com.cn:8082";// 网址public static final String LOGIN_URL = "http://oa.gyzq.com.cn:8082/login.do";// 登陆public static final String HOME_PAGE_URL = "http://oa.gyzq.com.cn:8082/welcome.do";// 主页//加班流程public static final String OVERTIME_PROCESS_URL = "http://oa.gyzq.com.cn:8082/StartWorkflow?Workflow=wfGYMS_JBSQ&extWindow=true&PopupWin=true";//保存加班流程 Token WorkID StepIDpublic static final String SAVE_OVERTIME_PROCESS_URL = "http://oa.gyzq.com.cn:8082/OperateProcessor?operate=WorkSave&Table=wfGYMS_JBSQ&isSubmit=1&operate=Update";//提交加班流程 WorkActionID Token WorkIDpublic static final String SUBMIT_OVERTIME_PROCESS_URL = "http://oa.gyzq.com.cn:8082/OperateProcessor?operate=WorkAction.5&Table=wfGYMS_JBSQ&StepID=initialStep&&isSubmit=1&operate=Add";// Token WorkID 前往新增详细public static final String ADD_OVERTIME_PROCESS_DETAIL_URL = "http://oa.gyzq.com.cn:8082/WorkProcessor?Table=wfGYMS_JBSQ&StepID=initialStep&Column=JBXXXX&SubmitForm=DATA_FORM&EVENT_SOURCE=AddHypotObj&extWindow=true&PopupWin=true";// 提交详细 Token WorkID OperateIDpublic static final String SUBMIT_OVERTIME_PROCESS_DETAIL_URL = "http://oa.gyzq.com.cn:8082/OperateProcessor?Table=wfGYMS_JBSQ&StepID=initialStep&extWindow=true&extWindow=true&PopupWin=true&NewWindow=false";//打开提交附件 Token WorkIDpublic static final String ADD_ATTACHMENT_URL = "http://oa.gyzq.com.cn:8082/WorkProcessor?Table=wfGYMS_JBSQ&StepID=initialStep&Column=FJ&SubmitForm=DATA_FORM&EVENT_SOURCE=AddHypotObj&extWindow=true&PopupWin=true";//提交附件 OperateID WorkID Tokenpublic static final String SUBMIT_ATTACHMENT_URL = "http://oa.gyzq.com.cn:8082/OperateProcessor?Table=wfGYMS_JBSQ&StepID=7&extWindow=true&PopupWin=true&NewWindow=false";// 
/OperateProcessor?Table=wfGYMS_JBSQ&Token=c04a7c4eca425b738e8686487351af6f&WorkID=202&StepID=initialStep&OperateID=34b53c980468fe3fc10140f46acf28e9&extWindow=true}