From ce320f00367a0eca3a4ca4e3564ae8816f9cef18 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 27 Aug 2021 12:32:11 +0800 Subject: [PATCH] =?UTF-8?q?=E6=8F=90=E4=BA=A4=E4=BF=A1=E6=81=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Book | 1 + pom.xml | 42 +++++ src/main/java/Main.java | 9 ++ src/main/java/Service/HTMLParseUtils.java | 165 ++++++++++++++++++++ src/main/java/Service/JsoupUtils.java | 151 ++++++++++++++++++ target/classes/Main.class | Bin 0 -> 550 bytes target/classes/Service/HTMLParseUtils.class | Bin 0 -> 4472 bytes target/classes/Service/JsoupUtils.class | Bin 0 -> 6769 bytes 8 files changed, 368 insertions(+) create mode 160000 Book create mode 100644 pom.xml create mode 100644 src/main/java/Main.java create mode 100644 src/main/java/Service/HTMLParseUtils.java create mode 100644 src/main/java/Service/JsoupUtils.java create mode 100644 target/classes/Main.class create mode 100644 target/classes/Service/HTMLParseUtils.class create mode 100644 target/classes/Service/JsoupUtils.class diff --git a/Book b/Book new file mode 160000 index 0000000..5f0130b --- /dev/null +++ b/Book @@ -0,0 +1 @@ +Subproject commit 5f0130b02d8d9a26532acd6c8dd79b43a957740a diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..f662418 --- /dev/null +++ b/pom.xml @@ -0,0 +1,42 @@ + + + 4.0.0 + + org.example + FindBook + 1.0-SNAPSHOT + + + + org.apache.maven.plugins + maven-compiler-plugin + + 8 + 8 + + + + + + + us.codecraft + webmagic-core + 0.7.3 + + + + us.codecraft + webmagic-extension + 0.7.3 + + + + com.squareup.okhttp3 + okhttp + 4.9.1 + + + + \ No newline at end of file diff --git a/src/main/java/Main.java b/src/main/java/Main.java new file mode 100644 index 0000000..28a556e --- /dev/null +++ b/src/main/java/Main.java @@ -0,0 +1,9 @@ +import Service.JsoupUtils; + +import java.io.IOException; + +public class Main { + public static void main(String[] args) throws IOException { + new JsoupUtils().getBook(1,3,"https://book.qidian.com/info/1029575290/#Catalog"); + } +} diff --git a/src/main/java/Service/HTMLParseUtils.java b/src/main/java/Service/HTMLParseUtils.java new file mode 100644 index 0000000..2914ea3 --- /dev/null +++ b/src/main/java/Service/HTMLParseUtils.java @@ -0,0 +1,165 @@ +package Service; + +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.selector.Html; +import us.codecraft.webmagic.selector.Selectable; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.List; + +public class HTMLParseUtils implements PageProcessor { + + private Site site = Site.me() + + .setCharset("utf-8") + + .setTimeOut(1000) + + .setSleepTime(1000); + + String bookName1 = ""; + + + @Override + + public Site getSite() {return site;} + + //爬取数据逻辑 + + //第一级URL https://www.qidian.com/xuanhuan 获取书栏目录 + + //第二级 https://book.qidian.com/info/1019251979#Catalog 章节目录 + + //第三级 https://read.qidian.com/chapter/SaT8jsiJD54smgY_yC2imA2/oQbX6YtwB_NOBDFlr9quQA2 章节内容 + + @Override + + public void process(Page page) { + + //获取URL + + Selectable table = page.getUrl(); + + //URL匹配 用.{23}去代替字符匹配,每个章节的后缀不一样 + + if (table.regex("https://read.qidian.com/chapter/.{23}/.{23}").match()) {//文章章节页面 + + Html html = page.getHtml(); + String title = ""; + List content = new ArrayList(); + //判断是否是第一章 + + if (html.xpath("/html/body/div[2]/div[3]/div[2]/div[1]/div[1]/div[1]/div[1]/h1/text()").toString() != null) {//是第一章 + + //获取书名 + + bookName1 = html.xpath("/html/body/div[2]/div[3]/div[2]/div[1]/div[1]/div[1]/div[1]/h1/text()").toString(); + + //System.out.println(bookName); + + //获取章节名 + + title = html.xpath("[@class='main-text-wrap']/div[1]/h3/span/text()").toString(); + + //System.out.println(title); + + //获取文章内容 + + content = html.xpath("[@class='main-text-wrap']/div[2]/p/text()").all(); + + } else {//不是第一章 + + //获取章节名 + + title = html.xpath("[@id='j_chapterBox']/div[1]/div[1]/div[1]/h3/span/text()").toString(); + + //获取文章内容 + + content = html.xpath("[@id='j_chapterBox']/div[1]/div[1]/div[2]/p/text()").all(); + + } + + //存到本地 + + downBook(bookName1, title, content); + + }else if(table.regex("https://book.qidian.com/info/\\d{10}#Catalog").match()){//书的章节目录 + + //获取每一章节的地址,在章节目录里每一章的xpath + + List url = page.getHtml().xpath("[@class='volume-wrap']/div[1]/ul/li/a/@href").all(); + + //加入待爬取序列 + + page.addTargetRequests(url); + + }else{//一级url + + Html html = page.getHtml(); + + List url = html.xpath("[@id='new-book-list']/div/ul/li/div[2]/h4/a/@href").all(); + + List url2 = new ArrayList(); + + for (String string : url) { + + url2.add(string + "#Catalog"); + + } + + //加入待爬取序列 + + page.addTargetRequests(url2); + + } + + } + + //将书存入本地 + + private void downBook(String bookName2, String title, List content) { + + //判断目录存不存在 + + File file = new File("D:/book.qidian/" + bookName2); + + if(!file.exists()){ + + file.mkdirs(); + + } + + PrintWriter pw = null; //使用IO流 + + try { + + FileOutputStream fos = new FileOutputStream("D:/book.qidian/" + bookName2 + "/" + title + ".txt"); + + pw = new PrintWriter(fos,true); + + for (String string : content) { + + pw.println(string); + + } + + System.out.println(title + " " + "爬取完毕"); + + } catch (FileNotFoundException e) { + + e.printStackTrace(); + + } finally {//关流 + + pw.close(); + + } + + } +} \ No newline at end of file diff --git a/src/main/java/Service/JsoupUtils.java b/src/main/java/Service/JsoupUtils.java new file mode 100644 index 0000000..f9f5742 --- /dev/null +++ b/src/main/java/Service/JsoupUtils.java @@ -0,0 +1,151 @@ +package Service; + + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.select.Elements; + +import java.io.*; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; + +public class JsoupUtils { + public void getBook(String url, int section) throws IOException { + if (Jsoup.connect(url).execute().statusCode() == 200) { + Document doc = Jsoup.connect(url).get(); + String bookName = doc.title(); + Elements links = doc.select("#j-catalogWrap > div.volume-wrap"); + Elements contentUrl = links.select("a"); + List urls = contentUrl.stream().map(element -> element.attr("abs:href")).collect(Collectors.toList()); + List titles = new ArrayList<>(); + List words_number = new ArrayList<>(); + for (String a : urls) { + if (a.contains("vipreader")) { + System.out.println("剩下的是vip章节了"); + break; + } else if (a.contains("chapter")) { + Document mo = Jsoup.connect(a).get(); + String id = mo.getElementsByClass("text-wrap").attr("id"); + System.out.println(id); + Elements title = mo.select("#" + id + " > div > div.text-head > h3 > span.content-wrap"); + Elements content = mo.select("#" + id + " > div > div.read-content.j_readContent"); + String tit = title.text(); + String con = content.text(); + if (!tit.equals("")) { + titles.add(tit); + words_number.add(con.length()); + } else { + continue; + } + downBook(bookName, tit, con, section); + + } else { + section++; + } + + } + if (!titles.isEmpty()) { + conclusion(bookName,titles,words_number); + } + } else { + return; + } + + } + + //获得从x卷到y卷 + public void getBook(int start, int end, String url) throws IOException { + Document doc = Jsoup.connect(url).get(); + String bookName = doc.title(); + Elements links = doc.select("#j-catalogWrap > div.volume-wrap"); + Elements contentUrl = links.select("a"); + List urls = contentUrl.stream().map(element -> element.attr("abs:href")).collect(Collectors.toList()); + List titles = new ArrayList<>(); + List words_number = new ArrayList<>(); + for (String a : urls) { + if (a.contains("chapter")) { + Document mo = Jsoup.connect(a).get(); + String id = mo.getElementsByClass("text-wrap").attr("id"); + System.out.println(id); + Elements title = mo.select("#" + id + " > div > div.text-head > h3 > span.content-wrap"); + Elements content = mo.select("#" + id + " > div > div.read-content.j_readContent"); + String tit = title.text(); + String con = content.text(); + if (!tit.equals("")) { + titles.add(tit); + words_number.add(con.length()); + } else { + continue; + } + + downBook(bookName, tit, con, start - 1); + + } else { + start++; + if (start == end) { + conclusion(bookName,titles,words_number); + return; + } + } + + } + } + + private static void downBook(String bookName2, String title, String content, int section) { + + File file = new File("D:/book.qidian/" + "#" + bookName2); + + if (!file.exists()) { + + //如果不存在目录,则创建目录 + file.mkdirs(); + } + + PrintWriter pw = null; + try { + + FileOutputStream fos = new FileOutputStream("D:/book.qidian/" + "#" + bookName2 + "/" + "第" + section + "卷" + title + ".txt"); + + pw = new PrintWriter(fos, true); + + pw.println(content); + + System.out.println(title + " " + "爬取完毕"); + + } catch (FileNotFoundException e) { + e.printStackTrace(); + return; + + } finally {//关流 + pw.close(); + } + } + + private static void conclusion(String bookName2, List titles, List words_number) { + + File file = new File("D:/book.qidian/" + "#" + bookName2); + + PrintWriter pw = null; + try { + + FileOutputStream fos = new FileOutputStream("D:/book.qidian/" + "#" + bookName2 + "/" + "conclusion" + ".txt"); + + pw = new PrintWriter(fos, true); + + for (int i = 0; i < titles.size(); i++) { + pw.println(titles.get(i)); + pw.println(words_number.get(i)); + } + + + } catch (FileNotFoundException e) { + e.printStackTrace(); + return; + + } finally {//关流 + pw.close(); + } + } +} + diff --git a/target/classes/Main.class b/target/classes/Main.class new file mode 100644 index 0000000000000000000000000000000000000000..2aa1402ebfed54a10dff5e24ad9c78a21a5bce5c GIT binary patch literal 550 zcmZuu%T59@6g|a(5kygZ;QJ9b=z^k=7@rG`CK%KOYGPcKv5wVYibFB}mK#Yl@dNxQ z<1G@|*rdJpp4*;t+J1e%e*)ORVhS#DE+$imA=i&78>d~&xR_-~?5a?ieTKgL$`yli zsBa}hrlLZ5)~eTJbRlX1A=!%dL~tb{W$P~Jn0wVEj>@S};SQayQ)HOP-&7vNv*3XU zeO@(@3jLi9yNG<+Kl)FbA$9cP$%aunY`U1Ex~f}|Cy$ktPg;e7B~7CrN%B*b(X;X- zKWXY#rh>Q2eG@9F-*R*~tJgHkH!h)ykT!nX@m2$GzTQXkjt0L|J@Z!NYJy}5d*W9X-!eZDKd)j z)i&5`r#(Px;ul1k@^m*J#1K_UWH3x5_V6B@PA~I;*mWCyP9p2PA>O00UklP5BE=q6 T1|!H4K1$OPjL{sYJcfxMaMf?2 literal 0 HcmV?d00001 diff --git a/target/classes/Service/HTMLParseUtils.class b/target/classes/Service/HTMLParseUtils.class new file mode 100644 index 0000000000000000000000000000000000000000..ccbc126cc0c149b2bb69642311c77e6fb07c45a3 GIT binary patch literal 4472 zcmbVQdw3Mp75`1LyE9pa*Yc31yaY54*bLzzbOVavQNR$W2~ZwZCcBgD!0yg6JDUV4 zZM9ZWtXiwKP}@>Un@TB)%0f)5wp47bweOdG*V6X;{I`Gj`Sp|j?#yPhOA=}O<>TJD z_nvd^dHl{fcX{>Q=U)J@7?*-*L{|{q=uxm=%y)?4fEYefg(wax_^6ma7R1MKr-Hji z;S;!944)J`_sF<6h_UEWa43lTaKAV?tl$9!pAw%BDtJi2!$C^;w1P(zd`56Rs^GCv zcpRS{h0o!L7>>&Lyo@J;m@TxKjV~zpqL>d0vMX$fG){`0r(`@G#4?IX)Y%|?Sg1~Y!Z8|qf zsH~~oN~x*%ToLuhH{h0y3&b>mrcD+a>6*r?vM!$-0VLc&` zb2=i+RD2sRsCW@CsrZf)5wEEDK7K%T+(dRmbhB;i`vrm}tkFbPXzf;fzZN&U zb}qclofdghVb$QRhD?bD%`tkMnp&bq#jE1JAL2C$+Rl}+q@K;Ls7~o-IwEKy-L{^o zF3L$P(y|#nU0|0```#SXBU7MN@gux0VcE`=W_(5Up4$u7H(5P{(uE;|%3gWb!vB)B z?9z6`N+=%DBP>xuljfHC9ybz+q;sHh(=T-a^KV#ae__+x7FYrs2sw=Cugln$c zYRZ|(xM8dK6@JZ>A5dJyZ}3|Qs!Lu-TNd}#QG%pswI+c4hKk>b8-I^iSkFu8OVgZL zt}$=xogTu?10kiBB&ZPe2Ni$B8#4Z+;?MXC^Ly5;e;hh>>7mCjoqpi&Pdz5%uPWX| zlZ5G|*9<8ZD*h&jCJ*0Q%G|Q4eGkjMgvju%u+Uf74n(!y>~$H-W<498UQiLn!ygt~ z%U~s!Fy|_?q7BgzuD!*=%APv7v7TA4vttdo)os#k3ES%S8p{Z(dabo0MSX)=!&#LO zy_cR?Fia`DO8XW3ZDvZ8bUxi#Hd27-zGJ=O~lYM@R=%0}h+x#$|W3l(!MyJsu z;imsV;}ImKbSIW@A2K_Z@xs@J#5*=nf?)j^(|NBupf;8>RABQ>C7yt)YR# zVoYh!cs(9(({1|WR%2hz$U1DC6AH!+(uoeEh?eFBeR7$k0WeUSoY1o^thvm(v^$k6 zs3(sAMcq52Lrp-?Z#SV?u+ZA-ZdLTl;8vi-uFQK%j;dRWeASs=AHZrb8) zP>{fSP+Bn>cL;Jm?bySU>9VYC-Jf-g6g98};;!iT#b-r%mGQQTnpo1x5_V%WRc!e1Izn>#)9XoyJ*Q z=dXJX^3(kA1P0vAnI+8rBJ&2^;xSibd7(mi9;5%h3#OiM_K z`uZ_;`#Fq@`Xiz7d4&2fLE@u9t1DU3nn-Bkz+u!E2~BbrlO;s`y_g;eT~m5EW$>_$ zidThwVgEUVdvV|V(9}{TrV1rO?;9X*nHR}pS})pN-6Ems0|zBahJAgQ!D|EIKp$ob zOX|hfED0~++Nd1%_hUBAnG=@tm>ar|Lmt=np}GRwdN02|FYM4%~}fIEvkP5^?_0>cAN!kjEZeKoYN! z>Mdj>8MYKemoyG{xIxNb6lJH-jEw})447ykxO}{4H#SkWpVIuiX){mXA}(&_DW7n> z&EvR0K5^>f4Xe@Waxk!kpo_0IY+=~_l8UX^MwonvcGVSc*j~J0J8$r$Z7)g_r#ndB z1o<*e4anFjW0#EEWPDi0?J|`__-%~2{xar3EVbEu2@!oED_9TgzSWBxWN$^b@&5Qpl?F`xfJ+5qcc2Jy^(Lxd4l>_$t6c3H!rJ9*cS}zxfOn7c_CjE%C(Vaf3j{(q2px$jamW zh2`jImHQR0dhraciQoiI@h7#I`)S!JJnPQ=e87vfhpG+CbKow<@ovn;J&eQsR=1b5<59O6&>!~8#h6FA1$AE%_FMAH+LcMRvb?&th6Pepy$g(~vME-DmC*+}FW ztSP7I?+$dj)#c&<7MQyUqDNGXi!RD4+%JK$N@{eH*xTb;as+#cSig%DuL4~p76m>8 z7M3CKZ`8?1y(@l#q-9u)@;jJ$8M6uiAua?oNWyav@|hMH`>J4*y8>Bi;J9D8{{WjB BteOA- literal 0 HcmV?d00001 diff --git a/target/classes/Service/JsoupUtils.class b/target/classes/Service/JsoupUtils.class new file mode 100644 index 0000000000000000000000000000000000000000..7047a279b640292754f40a98bdee7398cfe645d2 GIT binary patch literal 6769 zcma)A3t&^{b^d`tmTlduTbi!*zVqKJOR`K_ z9{2u_^PhA6^L^+1|J65s^5T~OtW`IM@pCv4#z*n720Vc$8*maIm)5B;eht4K!YAZ? zI)uB!_#{rt<5Ll?{7eHri>JeQ24|)9tbxxNcrJ_V3wO9s9i#$ucZ;Vbg~Y6vgO`)eWmj?DjV2)`$K zzAhGiUs~S?;}7u7FmA&i%Ge)8pzs;#{)vHCLip1#uEV$F@n`aQ)xe*J@E4-;FGKjN z5dJ!h8}Kaye-p+A{H;9b)wkvScLx4GjLrCm5dKj#e@7nQm6_K|g;_PJu-%|*# zPo|U3Mg?EzioFW{zRX}kp=ltQPHZoX9!TVN+Xqq{i40`oc51JkOG@7x^gF}JyuzG; zU5VUSGMYB@C7jKf%!tC=&VfVrm>o^o>7nQ@CznhQ_4bRoy3vfn z{DDkvD0)biM$|K`~ylTp{2j9r3KKVtGE1O2nP$)>J~4YbWBF zw8J(xNO?ZlS5P)vgPFKO=m0(3ZjVw?AeBsya64gd$(~$_0V(8C9BXzxC~#}hfn;7h z3u^nEGY@BSgZX>XI$YGt72JHDjwH3Meg>dAdPlRR2zE@}OTbi;8A``?C5O_sQ^?VyyMMZ>84;hRzEY+Px7ch>*`o&r?G7*1 zI<8gd?yN`%H=n$G=w0FJtxQ8Ql{q8XqF`NdVXdLn4BDWrp_U>}>nS_QB*> z&sZkKBG7%9BPRY2KPHJy`~;Kqg*8Tg@?4tEuLyc?wrudib-iU3m2J$??~pc#cAspE>=#M?Ub{iw_v;DpOf%K7$?~ zwzK3RwK$0*j=M|Mh98@1fm+D4oNjJ&A(^5jvTUiT7OAThqHdhLXlX6O^oIT6HGJf= zcDl!_WS;tV3HI`;W#WIgC)IQ49_jYE1=CcEMI5GDA|^ZJaSh%y)G|{oSDhw~;=QI? zp_&=>l_nm-!>okbYcH!Avn{cxWNFWR$-$(ZjxHrB)O=HQNmN&GqUw-K9rDGqx^ts1gf9`~-qPSgQVWp9p zQevvrBGeRR6uc53c1(4x=Bc_)5L?3nXdlSOhI5I7hFZ%LsrJOH;>eBzhgko)!7CkY z%(n8>v}23vwac2O{bY&lWQhFK#`||n6~fY}PU+plxy)hrxNa&d*NG{N%cRrVr!lHY?}RCnCnZ~`Fw&!esSl_T02TB zsD9y0>Zb%)V|WNTg}i$!;T45vsgJczVL_Ghnrp)xPU{#}*Seq^X(rE4iNQ2P=xLAz zhvyUT8zt2m9(PnOHF9``)jD;(!m>*3E)}Y3uQ$Z2p1Oett)0!Lj`GZ_QhczG))+*$ zc@6sDR)t!-49cjTRk+c!t0ig`cp<4>Pt|Q@#(7)n%cSHomB|sxpp%ipsIaKY$LW&P zLz?Mj3acxfsgbON#|sC!BuBGL_PJGtN=YCFhVA@zmQ3!sbH#pEg|zNe176lh);o4G z%{{bLg3>o^=XNFTDmywX5=v`qpxK=>}zOV8Cq9~cX%**At&Pw z(WV?3gqBgT<%(P7aEY{xjL#nA{V$kGq=%egg_g2C?<&GwkJ+h0V#h(b>GxNfFI`qB zlFV-%%{oVg^EZmsBjl4XYjk8VDHK~!y>hza^SzBS;Fae@?|yw%ji)kFfSPpl^&NP3 z&W>}$pHIG*uhweiQ}mc|&je{+xhz%PpbYg6Q^nK<6TixP20nyi3QNiXNv6j#BZ=sM zzLW88Vjr|6ftcG3qvf8VHgVqrJQxei*>6@N=qYfoHMiR7!BisO;Y#+_0$njuwQx7T zFDk`<_DR>|+pn#}*VEi*r&7C^yS;|Gi61L90hwt(tR-(R$M4+R_;oLT41O5%I?3mD z_M&|AVuHZR^HAs5sKX9EgL<$L?_}F_1#ky;vZt^MyD7!RJ`;ZM7XY=d>jLT~;8U1@ zpRC5$Ppd8NSb%1_iwMU2-4UaRP%K~tBK1XtV?ireM8k_z&sB|exPVy`XsW}P(Ht`_ zqUEmhh{QsrM!jWBV75XmY=tK$<1#@E!RzqZ75m!BlSi z0zLSY)u273H!;&PBMXaI6uG*H_E;mutwxGl=0z-~%_WhgMRdqeN8}nlidZIY&zfm| zd3nCcnpH$+tl4UkSBupgSy9Bwlasrxkkveat~%_q>aCU{R$V~1*Y{h^R?C75=sAPn zc%3>kx!ej(AgcJgi|-8l=L8PFhU0!*&2|&!@TZ4GjBf|Sz6!Ikjw5%V75n)ENF4L< z0bGU0VBvAh$H&lyPhtU{#X?-3tJ>5NPaIRUXhlU62fzr z{hzkXDcg-!V+H#qS%kQ4R$~?WIaI~|rkUm|*k7i-us>q8clRx)w(?Ri!2L^`Swn(22pSs+Q=)2%t?_Ix&JwmS&w82KW5mIx#}r z*&8}B>KTFW=)?$5QQwsk!n;x+UMCX$^_A(yfx`u{NfezwRXJB?GSCr(%JNxS>WkawAu; zM86{vW21}zBfTXZ>tn%-*l?E>^qJ+Zoa~YJX}W)$)mPr9@hScG=>oi&g%`wO0ps4mIIkpf-K-k-BF$$O zIsSc=!T2!4ahw4pC$E|SzV-vEr(y1r}0S&1~rsKQ>xjxfnNdN z4-6Q1u%5v4#|5es#GdyMJ9TBmwyT(bMu@hQ0U?G2B%lHD`>qeR2D>J(RWW|cEPq#P za01)pRG&+rEdMI6U&JlrK^}Rp3KKL$#IlG!-4%#r*n?;hgEdZw_eU6N zdAnz%3(@;n8s1OIaTbRMnD7rVm=7~_j}VB5xc)e2ALs0cID1S(|6zhAnm$fAN)T=5 zsr3j~1*v6&1}i{$7pwp+o|*#dlukdnL7md+SEua@L_T`qLgd#eIadQI@*|S|T0X@w zfv*#RNkU?1;M@a8a5L~IC4Y$STc+@PjM8=ZU~^dR%Wmf3T8@ZDV`b#lBKkkCMas9h zO?q4IEzZ!J1();A{j)8fQ@fmdJFbm9&O!v1$vTq#3H(>EUiGStYCUftzrk_Y@l%xj OO?(<>yw)G%%>M%x;k4@j literal 0 HcmV?d00001