[关闭]
@ruoli 2016-10-09T17:19:45.000000Z 字数 2535 阅读 1662

使用Jsoup开发简易网页爬虫程序

Java基础


Maven:

  <!-- Maven dependency on jsoup 1.8.1, the org.jsoup library imported by HTTPCommonUtil.java in this note. -->
  1. <dependency>
  2. <groupId>org.jsoup</groupId>
  3. <artifactId>jsoup</artifactId>
  4. <version>1.8.1</version>
  5. </dependency>
  // Usage example: fetch the note page through HTTPCommonUtil (5000 is passed through as the
  // jsoup connection timeout) and serialize the elements matching div#editor-reader-full.
  // NOTE(review): getHttpsDocument returns null when the fetch fails, so production code
  // should null-check `document` before calling select().
  1. URL url = new URL("https://www.zybuluo.com/ruoli/note/483730");
  2. Document document=HTTPCommonUtil.getHttpsDocument(url, 5000);
  3. String content=document.select("div#editor-reader-full").toString();

HTTPCommonUtil.java

  1. package tools;
  2. import java.net.MalformedURLException;
  3. import java.net.URL;
  4. import java.security.SecureRandom;
  5. import java.security.cert.CertificateException;
  6. import java.security.cert.X509Certificate;
  7. import java.util.Map;
  8. import javax.net.ssl.HostnameVerifier;
  9. import javax.net.ssl.HttpsURLConnection;
  10. import javax.net.ssl.SSLContext;
  11. import javax.net.ssl.SSLSession;
  12. import javax.net.ssl.X509TrustManager;
  13. import org.jsoup.Connection;
  14. import org.jsoup.helper.HttpConnection;
  15. import org.jsoup.nodes.Document;
  16. public class HTTPCommonUtil {
  17. public static void trustEveryone() {
  18. try {
  19. HttpsURLConnection.setDefaultHostnameVerifier(new HostnameVerifier() {
  20. public boolean verify(String hostname, SSLSession session) {
  21. return true;
  22. }
  23. });
  24. SSLContext context = SSLContext.getInstance("TLS");
  25. context.init(null, new X509TrustManager[] { new X509TrustManager() {
  26. public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
  27. }
  28. public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
  29. }
  30. public X509Certificate[] getAcceptedIssuers() {
  31. return new X509Certificate[0];
  32. }
  33. } }, new SecureRandom());
  34. HttpsURLConnection.setDefaultSSLSocketFactory(context.getSocketFactory());
  35. } catch (Exception e) {
  36. // e.printStackTrace();
  37. }
  38. }
  39. public static Object getHttpHeaders(URL url, int timeout) {
  40. try {
  41. trustEveryone();
  42. Connection conn = HttpConnection.connect(url);
  43. conn.timeout(timeout);
  44. conn.header("Accept-Encoding", "gzip,deflate,sdch");
  45. conn.header("Connection", "close");
  46. conn.get();
  47. Map<String, String> result = conn.response().headers();
  48. result.put("title", conn.response().parse().title());
  49. return result;
  50. } catch (Exception e) {
  51. //e.printStackTrace();
  52. }
  53. return null;
  54. }
  55. public static Document getHttpsDocument(URL url, int timeout){
  56. try {
  57. trustEveryone();
  58. Connection conn = HttpConnection.connect(url);
  59. conn.timeout(timeout);
  60. conn.header("Accept-Encoding", "gzip,deflate,sdch");
  61. conn.header("Connection", "close");
  62. Document document=conn.get();
  63. return document;
  64. } catch (Exception e) {
  65. e.printStackTrace();
  66. }
  67. return null;
  68. }
  69. public static void main(String[] args) {
  70. try {
  71. URL url = new URL("https://www.zybuluo.com/ruoli/note/483730");
  72. Document document=HTTPCommonUtil.getHttpsDocument(url, 5000);
  73. String title = document.title();
  74. System.out.println(document.toString());
  75. } catch (MalformedURLException e) {
  76. e.printStackTrace();
  77. }
  78. }
  79. }
添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注