diff --git a/Book b/Book
new file mode 160000
index 0000000..5f0130b
--- /dev/null
+++ b/Book
@@ -0,0 +1 @@
+Subproject commit 5f0130b02d8d9a26532acd6c8dd79b43a957740a
diff --git a/pom.xml b/pom.xml
new file mode 100644
index 0000000..f662418
--- /dev/null
+++ b/pom.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>org.example</groupId>
+    <artifactId>FindBook</artifactId>
+    <version>1.0-SNAPSHOT</version>
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <configuration>
+                    <source>8</source>
+                    <target>8</target>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+    <dependencies>
+        <dependency>
+            <groupId>us.codecraft</groupId>
+            <artifactId>webmagic-core</artifactId>
+            <version>0.7.3</version>
+        </dependency>
+
+        <dependency>
+            <groupId>us.codecraft</groupId>
+            <artifactId>webmagic-extension</artifactId>
+            <version>0.7.3</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.squareup.okhttp3</groupId>
+            <artifactId>okhttp</artifactId>
+            <version>4.9.1</version>
+        </dependency>
+    </dependencies>
+
+</project>
\ No newline at end of file
diff --git a/src/main/java/Main.java b/src/main/java/Main.java
new file mode 100644
index 0000000..28a556e
--- /dev/null
+++ b/src/main/java/Main.java
@@ -0,0 +1,9 @@
+import Service.JsoupUtils;
+
+import java.io.IOException;
+
+public class Main {
+    public static void main(String[] args) throws IOException {
+        new JsoupUtils().getBook(1,3,"https://book.qidian.com/info/1029575290/#Catalog");
+    }
+}
diff --git a/src/main/java/Service/HTMLParseUtils.java b/src/main/java/Service/HTMLParseUtils.java
new file mode 100644
index 0000000..2914ea3
--- /dev/null
+++ b/src/main/java/Service/HTMLParseUtils.java
@@ -0,0 +1,140 @@
+package Service;
+
+import us.codecraft.webmagic.Page;
+import us.codecraft.webmagic.Site;
+import us.codecraft.webmagic.processor.PageProcessor;
+import us.codecraft.webmagic.selector.Html;
+import us.codecraft.webmagic.selector.Selectable;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
+/**
+ * WebMagic page processor that crawls qidian.com in three levels:
+ * <ol>
+ *   <li>a listing page such as https://www.qidian.com/xuanhuan, which links to books;</li>
+ *   <li>a book catalog page such as https://book.qidian.com/info/1019251979#Catalog;</li>
+ *   <li>a chapter page under https://read.qidian.com/chapter/, whose text is saved to disk.</li>
+ * </ol>
+ */
+public class HTMLParseUtils implements PageProcessor {
+
+    /** Root directory that downloaded books are written under. */
+    private static final String BASE_DIR = "D:/book.qidian/";
+
+    /** Characters that Windows does not allow in a file name. */
+    private static final Pattern ILLEGAL_FILE_CHARS = Pattern.compile("[\\\\/:*?\"<>|]");
+
+    /** Heading that holds the book name; its presence is treated as "this is the first chapter". */
+    private static final String BOOK_NAME_XPATH =
+            "/html/body/div[2]/div[3]/div[2]/div[1]/div[1]/div[1]/div[1]/h1/text()";
+
+    private Site site = Site.me()
+            .setCharset("utf-8")
+            .setTimeOut(1000)
+            .setSleepTime(1000);
+
+    // Book name taken from the most recent first-chapter page; chapter pages
+    // without the heading are filed under this name.
+    String bookName1 = "";
+
+    @Override
+    public Site getSite() {
+        return site;
+    }
+
+    /**
+     * Routes a fetched page by its URL: chapter pages are saved, catalog pages
+     * enqueue their chapter links, and any other page is treated as a listing
+     * page whose book links are enqueued with "#Catalog" appended.
+     *
+     * @param page the page handed over by the WebMagic spider
+     */
+    @Override
+    public void process(Page page) {
+        Selectable table = page.getUrl();
+
+        // NOTE(review): the XPath strings below that begin with "[@...]" have no
+        // leading node test; they are kept exactly as committed - confirm them
+        // against the live pages.
+        // ".{23}" stands in for the two path segments, which differ per chapter.
+        if (table.regex("https://read.qidian.com/chapter/.{23}/.{23}").match()) {
+            Html html = page.getHtml();
+            String title;
+            List<String> content;
+
+            // Evaluate the heading once instead of once for the test and once for the value.
+            String heading = html.xpath(BOOK_NAME_XPATH).toString();
+            if (heading != null) {
+                // First chapter: remember the book name for the chapters that follow.
+                bookName1 = heading;
+                title = html.xpath("[@class='main-text-wrap']/div[1]/h3/span/text()").toString();
+                content = html.xpath("[@class='main-text-wrap']/div[2]/p/text()").all();
+            } else {
+                // Any later chapter uses a different container.
+                title = html.xpath("[@id='j_chapterBox']/div[1]/div[1]/div[1]/h3/span/text()").toString();
+                content = html.xpath("[@id='j_chapterBox']/div[1]/div[1]/div[2]/p/text()").all();
+            }
+
+            downBook(bookName1, title, content);
+        } else if (table.regex("https://book.qidian.com/info/\\d{10}#Catalog").match()) {
+            // Catalog page: queue the chapter links found in the catalog.
+            List<String> url = page.getHtml().xpath("[@class='volume-wrap']/div[1]/ul/li/a/@href").all();
+            page.addTargetRequests(url);
+        } else {
+            // Listing page: queue each book's catalog.
+            Html html = page.getHtml();
+            List<String> url = html.xpath("[@id='new-book-list']/div/ul/li/div[2]/h4/a/@href").all();
+            List<String> url2 = new ArrayList<>();
+            for (String string : url) {
+                url2.add(string + "#Catalog");
+            }
+            page.addTargetRequests(url2);
+        }
+    }
+
+    /**
+     * Writes one chapter to {@code D:/book.qidian/<book>/<title>.txt}, one paragraph per line.
+     *
+     * @param bookName2 name of the book, used as the directory name
+     * @param title     chapter title, used as the file name; the chapter is skipped when null
+     * @param content   paragraphs of the chapter
+     */
+    private void downBook(String bookName2, String title, List<String> content) {
+        if (title == null) {
+            // Without a title the original code produced a file literally named "null.txt".
+            System.err.println("Chapter title not found, page skipped");
+            return;
+        }
+
+        File dir = new File(BASE_DIR + sanitize(bookName2));
+        if (!dir.exists()) {
+            dir.mkdirs();
+        }
+
+        File target = new File(dir, sanitize(title) + ".txt");
+        // try-with-resources: the old finally block called close() on a null writer
+        // whenever the file could not be opened.
+        try (PrintWriter pw = new PrintWriter(
+                new OutputStreamWriter(new FileOutputStream(target), StandardCharsets.UTF_8), true)) {
+            for (String string : content) {
+                pw.println(string);
+            }
+            System.out.println(title + " " + "爬取完毕");
+        } catch (FileNotFoundException e) {
+            e.printStackTrace();
+        }
+    }
+
+    /** Replaces characters that are illegal in Windows file names with an underscore. */
+    private static String sanitize(String name) {
+        return ILLEGAL_FILE_CHARS.matcher(name).replaceAll("_");
+    }
+}
diff --git a/src/main/java/Service/JsoupUtils.java b/src/main/java/Service/JsoupUtils.java
new file mode 100644
index 0000000..f9f5742
--- /dev/null
+++ b/src/main/java/Service/JsoupUtils.java
@@ -0,0 +1,193 @@
+package Service;
+
+import org.jsoup.Connection;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.select.Elements;
+
+import java.io.*;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+/**
+ * Downloads the chapters linked from a qidian.com book catalog page with Jsoup
+ * and stores each chapter as a text file under {@code D:/book.qidian/#<page title>/}.
+ */
+public class JsoupUtils {
+
+    /** Root directory that downloaded books are written under. */
+    private static final String BASE_DIR = "D:/book.qidian/";
+
+    /** Characters that Windows does not allow in a file name. */
+    private static final Pattern ILLEGAL_FILE_CHARS = Pattern.compile("[\\\\/:*?\"<>|]");
+
+    /**
+     * Downloads catalog chapters until the first link containing "vipreader".
+     *
+     * @param url     catalog page of the book
+     * @param section number used in the file names; incremented on every catalog
+     *                link that is neither a "vipreader" nor a "chapter" link
+     * @throws IOException if the catalog or a chapter page cannot be fetched
+     */
+    public void getBook(String url, int section) throws IOException {
+        // A single request now serves both the status check and the document;
+        // the page used to be fetched twice.
+        Connection.Response response = Jsoup.connect(url).execute();
+        if (response.statusCode() != 200) {
+            return;
+        }
+        Document doc = response.parse();
+        String bookName = doc.title();
+        List<String> titles = new ArrayList<>();
+        List<Integer> words_number = new ArrayList<>();
+
+        for (String a : catalogLinks(doc)) {
+            if (a.contains("vipreader")) {
+                System.out.println("剩下的是vip章节了");
+                break;
+            } else if (a.contains("chapter")) {
+                downloadChapter(a, bookName, section, titles, words_number);
+            } else {
+                section++;
+            }
+        }
+
+        if (!titles.isEmpty()) {
+            conclusion(bookName, titles, words_number);
+        }
+    }
+
+    /**
+     * Downloads chapters while counting volumes from {@code start} and stops once
+     * the counter reaches {@code end}.
+     * <p>
+     * NOTE(review): the counter advances on every catalog link that is not a
+     * "chapter" link, and chapters are labelled with {@code counter - 1}. Whether
+     * that lines up with the site's real volume boundaries depends on the catalog
+     * markup - confirm before relying on the volume numbers.
+     *
+     * @param start initial value of the volume counter
+     * @param end   counter value at which downloading stops
+     * @param url   catalog page of the book
+     * @throws IOException if the catalog or a chapter page cannot be fetched
+     */
+    public void getBook(int start, int end, String url) throws IOException {
+        Document doc = Jsoup.connect(url).get();
+        String bookName = doc.title();
+        List<String> titles = new ArrayList<>();
+        List<Integer> words_number = new ArrayList<>();
+
+        int volume = start;
+        for (String a : catalogLinks(doc)) {
+            if (a.contains("chapter")) {
+                downloadChapter(a, bookName, volume - 1, titles, words_number);
+            } else {
+                volume++;
+                if (volume == end) {
+                    break;
+                }
+            }
+        }
+
+        // Written after the loop so the summary also exists when the catalog ends
+        // before the counter reaches "end"; skipped when nothing was downloaded,
+        // matching the other overload.
+        if (!titles.isEmpty()) {
+            conclusion(bookName, titles, words_number);
+        }
+    }
+
+    /** Returns the absolute targets of all links inside the catalog's volume blocks. */
+    private static List<String> catalogLinks(Document doc) {
+        Elements links = doc.select("#j-catalogWrap > div.volume-wrap");
+        Elements contentUrl = links.select("a");
+        return contentUrl.stream().map(element -> element.attr("abs:href")).collect(Collectors.toList());
+    }
+
+    /**
+     * Fetches one chapter page, saves it, and records its title and text length.
+     * Pages without a text container id or without a title are skipped.
+     */
+    private static void downloadChapter(String chapterUrl, String bookName, int section,
+                                        List<String> titles, List<Integer> wordCounts) throws IOException {
+        Document mo = Jsoup.connect(chapterUrl).get();
+        String id = mo.getElementsByClass("text-wrap").attr("id");
+        System.out.println(id);
+        if (id.isEmpty()) {
+            // "#" followed by nothing is not a valid CSS selector, so there is nothing to query.
+            return;
+        }
+
+        String tit = mo.select("#" + id + " > div > div.text-head > h3 > span.content-wrap").text();
+        String con = mo.select("#" + id + " > div > div.read-content.j_readContent").text();
+        if (tit.equals("")) {
+            return;
+        }
+
+        titles.add(tit);
+        wordCounts.add(con.length());
+        downBook(bookName, tit, con, section);
+    }
+
+    /**
+     * Writes one chapter to {@code <book dir>/第<section>卷<title>.txt}.
+     *
+     * @param bookName2 page title of the book, used for the directory name
+     * @param title     chapter title
+     * @param content   chapter text
+     * @param section   volume number embedded in the file name
+     */
+    private static void downBook(String bookName2, String title, String content, int section) {
+        File target = new File(bookDir(bookName2), "第" + section + "卷" + sanitize(title) + ".txt");
+        try (PrintWriter pw = openWriter(target)) {
+            pw.println(content);
+            System.out.println(title + " " + "爬取完毕");
+        } catch (FileNotFoundException e) {
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Writes {@code conclusion.txt}: each chapter title followed by the length of its text.
+     *
+     * @param bookName2    page title of the book, used for the directory name
+     * @param titles       chapter titles in download order
+     * @param words_number text length of each chapter, parallel to {@code titles}
+     */
+    private static void conclusion(String bookName2, List<String> titles, List<Integer> words_number) {
+        File target = new File(bookDir(bookName2), "conclusion" + ".txt");
+        try (PrintWriter pw = openWriter(target)) {
+            for (int i = 0; i < titles.size(); i++) {
+                pw.println(titles.get(i));
+                pw.println(words_number.get(i));
+            }
+        } catch (FileNotFoundException e) {
+            e.printStackTrace();
+        }
+    }
+
+    /** Returns the book's output directory, creating it when it does not exist yet. */
+    private static File bookDir(String bookName) {
+        File dir = new File(BASE_DIR + "#" + sanitize(bookName));
+        if (!dir.exists()) {
+            dir.mkdirs();
+        }
+        return dir;
+    }
+
+    /**
+     * Opens an auto-flushing UTF-8 writer; callers close it with try-with-resources,
+     * which replaces a finally block that dereferenced a null writer on open failure.
+     */
+    private static PrintWriter openWriter(File target) throws FileNotFoundException {
+        return new PrintWriter(new OutputStreamWriter(new FileOutputStream(target), StandardCharsets.UTF_8), true);
+    }
+
+    /** Replaces characters that are illegal in Windows file names with an underscore. */
+    private static String sanitize(String name) {
+        return ILLEGAL_FILE_CHARS.matcher(name).replaceAll("_");
+    }
+}
diff --git a/target/classes/Main.class b/target/classes/Main.class
new file mode 100644
index 0000000..2aa1402
Binary files /dev/null and b/target/classes/Main.class differ
diff --git a/target/classes/Service/HTMLParseUtils.class b/target/classes/Service/HTMLParseUtils.class
new file mode 100644
index 0000000..ccbc126
Binary files /dev/null and b/target/classes/Service/HTMLParseUtils.class differ
diff --git a/target/classes/Service/JsoupUtils.class b/target/classes/Service/JsoupUtils.class
new file mode 100644
index 0000000..7047a27
Binary files /dev/null and b/target/classes/Service/JsoupUtils.class differ