提交信息

This commit is contained in:
Your Name 2021-08-27 12:32:11 +08:00
父節點 5f0130b02d
當前提交 ce320f0036
共有 8 個檔案被更改,包括 368 行新增和 0 行刪除

1
Book Submodule

@ -0,0 +1 @@
Subproject commit 5f0130b02d8d9a26532acd6c8dd79b43a957740a

42
pom.xml Normal file
查看文件

@ -0,0 +1,42 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build descriptor for the FindBook project (artifact org.example:FindBook). -->
<modelVersion>4.0.0</modelVersion>
<groupId>org.example</groupId>
<artifactId>FindBook</artifactId>
<version>1.0-SNAPSHOT</version>
<build>
<plugins>
<!-- Compile with Java 8 source and target levels. -->
<!-- NOTE(review): no <version> is declared for maven-compiler-plugin here; consider pinning one. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>8</source>
<target>8</target>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<!-- WebMagic crawler framework; Service.HTMLParseUtils imports us.codecraft.webmagic classes. -->
<!-- NOTE(review): Service.JsoupUtils imports org.jsoup, which is not declared directly in this file;
     presumably it is resolved transitively through WebMagic - confirm. -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.squareup.okhttp3/okhttp -->
<!-- NOTE(review): none of the Java sources shown in this commit import okhttp3 - confirm it is needed. -->
<dependency>
<groupId>com.squareup.okhttp3</groupId>
<artifactId>okhttp</artifactId>
<version>4.9.1</version>
</dependency>
</dependencies>
</project>

9
src/main/java/Main.java Normal file
查看文件

@ -0,0 +1,9 @@
import Service.JsoupUtils;
import java.io.IOException;
/**
 * Program entry point: downloads chapters of one hard-coded qidian.com book
 * through {@link JsoupUtils#getBook(int, int, String)}.
 */
public class Main {

    /**
     * Starts the download with the range arguments 1 and 3 for the hard-coded catalogue URL.
     *
     * @param args command-line arguments; not used
     * @throws IOException if the download reports an I/O failure
     */
    public static void main(String[] args) throws IOException {
        JsoupUtils downloader = new JsoupUtils();
        String catalogUrl = "https://book.qidian.com/info/1029575290/#Catalog";
        downloader.getBook(1, 3, catalogUrl);
    }
}

查看文件

@ -0,0 +1,165 @@
package Service;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
/**
 * WebMagic {@link PageProcessor} that walks qidian.com pages and stores chapter text on disk.
 *
 * <p>Three kinds of URL are handled by {@link #process(Page)}:
 * <ol>
 *   <li>a book listing page, e.g. {@code https://www.qidian.com/xuanhuan};</li>
 *   <li>a book's chapter catalogue, e.g. {@code https://book.qidian.com/info/1019251979#Catalog};</li>
 *   <li>a chapter page, e.g.
 *       {@code https://read.qidian.com/chapter/SaT8jsiJD54smgY_yC2imA2/oQbX6YtwB_NOBDFlr9quQA2}.</li>
 * </ol>
 */
public class HTMLParseUtils implements PageProcessor {

    /** XPath of the book-title heading; per the original author it is only present on a first chapter. */
    private static final String BOOK_NAME_XPATH =
            "/html/body/div[2]/div[3]/div[2]/div[1]/div[1]/div[1]/div[1]/h1/text()";

    /** Crawler settings: UTF-8 charset, 1000 ms timeout, 1000 ms sleep between requests. */
    private Site site = Site.me()
            .setCharset("utf-8")
            .setTimeOut(1000)
            .setSleepTime(1000);

    /**
     * Name of the book whose chapters are currently being saved. It is updated whenever a chapter
     * page exposes the book title and reused for the chapter pages that do not.
     */
    String bookName1 = "";

    /** Returns the crawler settings used for every request. */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Dispatches on the page URL: saves chapter pages to disk, queues chapter links found on a
     * catalogue page, and queues catalogue links found on any other (listing) page.
     *
     * <p>NOTE(review): several XPath strings below start with a bare predicate
     * ({@code [@class=...]}, {@code [@id=...]}); they may have lost a leading {@code //*} -
     * confirm against the selector library before relying on them.
     *
     * @param page the downloaded page handed over by WebMagic
     */
    @Override
    public void process(Page page) {
        Selectable table = page.getUrl();
        // ".{23}" stands in for the two path segments, which differ for every chapter.
        if (table.regex("https://read.qidian.com/chapter/.{23}/.{23}").match()) {
            // Chapter page.
            Html html = page.getHtml();
            String title;
            List<String> content;
            // Evaluate the title XPath once; a non-null result marks the first chapter.
            String firstChapterBookName = html.xpath(BOOK_NAME_XPATH).toString();
            if (firstChapterBookName != null) {
                bookName1 = firstChapterBookName;
                title = html.xpath("[@class='main-text-wrap']/div[1]/h3/span/text()").toString();
                content = html.xpath("[@class='main-text-wrap']/div[2]/p/text()").all();
            } else {
                title = html.xpath("[@id='j_chapterBox']/div[1]/div[1]/div[1]/h3/span/text()").toString();
                content = html.xpath("[@id='j_chapterBox']/div[1]/div[1]/div[2]/p/text()").all();
            }
            downBook(bookName1, title, content);
        } else if (table.regex("https://book.qidian.com/info/\\d{10}#Catalog").match()) {
            // Chapter catalogue of one book: queue every chapter link.
            List<String> url = page.getHtml().xpath("[@class='volume-wrap']/div[1]/ul/li/a/@href").all();
            page.addTargetRequests(url);
        } else {
            // Listing page: queue the catalogue of every listed book.
            Html html = page.getHtml();
            List<String> url = html.xpath("[@id='new-book-list']/div/ul/li/div[2]/h4/a/@href").all();
            List<String> url2 = new ArrayList<String>();
            for (String string : url) {
                url2.add(string + "#Catalog");
            }
            page.addTargetRequests(url2);
        }
    }

    /**
     * Writes one chapter to {@code D:/book.qidian/<bookName2>/<title>.txt}, one paragraph per line,
     * creating the book directory when it does not exist yet.
     *
     * <p>If the file cannot be opened the stack trace is printed and the chapter is skipped.
     *
     * @param bookName2 name of the book; used as the directory name
     * @param title     chapter title; used as the file name
     * @param content   paragraphs of the chapter
     */
    private void downBook(String bookName2, String title, List<String> content) {
        File file = new File("D:/book.qidian/" + bookName2);
        if (!file.exists()) {
            file.mkdirs();
        }
        String path = "D:/book.qidian/" + bookName2 + "/" + title + ".txt";
        // try-with-resources closes the writer only if it was actually opened. The previous
        // finally-block called close() on a null writer when opening failed, which raised a
        // NullPointerException on top of the real error.
        try (PrintWriter pw = new PrintWriter(new FileOutputStream(path), true)) {
            for (String string : content) {
                pw.println(string);
            }
            System.out.println(title + " " + "爬取完毕");
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }
}

查看文件

@ -0,0 +1,151 @@
package Service;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
public class JsoupUtils {
public void getBook(String url, int section) throws IOException {
if (Jsoup.connect(url).execute().statusCode() == 200) {
Document doc = Jsoup.connect(url).get();
String bookName = doc.title();
Elements links = doc.select("#j-catalogWrap > div.volume-wrap");
Elements contentUrl = links.select("a");
List<String> urls = contentUrl.stream().map(element -> element.attr("abs:href")).collect(Collectors.toList());
List<String> titles = new ArrayList<>();
List<Integer> words_number = new ArrayList<>();
for (String a : urls) {
if (a.contains("vipreader")) {
System.out.println("剩下的是vip章节了");
break;
} else if (a.contains("chapter")) {
Document mo = Jsoup.connect(a).get();
String id = mo.getElementsByClass("text-wrap").attr("id");
System.out.println(id);
Elements title = mo.select("#" + id + " > div > div.text-head > h3 > span.content-wrap");
Elements content = mo.select("#" + id + " > div > div.read-content.j_readContent");
String tit = title.text();
String con = content.text();
if (!tit.equals("")) {
titles.add(tit);
words_number.add(con.length());
} else {
continue;
}
downBook(bookName, tit, con, section);
} else {
section++;
}
}
if (!titles.isEmpty()) {
conclusion(bookName,titles,words_number);
}
} else {
return;
}
}
//获得从x卷到y卷
public void getBook(int start, int end, String url) throws IOException {
Document doc = Jsoup.connect(url).get();
String bookName = doc.title();
Elements links = doc.select("#j-catalogWrap > div.volume-wrap");
Elements contentUrl = links.select("a");
List<String> urls = contentUrl.stream().map(element -> element.attr("abs:href")).collect(Collectors.toList());
List<String> titles = new ArrayList<>();
List<Integer> words_number = new ArrayList<>();
for (String a : urls) {
if (a.contains("chapter")) {
Document mo = Jsoup.connect(a).get();
String id = mo.getElementsByClass("text-wrap").attr("id");
System.out.println(id);
Elements title = mo.select("#" + id + " > div > div.text-head > h3 > span.content-wrap");
Elements content = mo.select("#" + id + " > div > div.read-content.j_readContent");
String tit = title.text();
String con = content.text();
if (!tit.equals("")) {
titles.add(tit);
words_number.add(con.length());
} else {
continue;
}
downBook(bookName, tit, con, start - 1);
} else {
start++;
if (start == end) {
conclusion(bookName,titles,words_number);
return;
}
}
}
}
private static void downBook(String bookName2, String title, String content, int section) {
File file = new File("D:/book.qidian/" + "#" + bookName2);
if (!file.exists()) {
//如果不存在目录则创建目录
file.mkdirs();
}
PrintWriter pw = null;
try {
FileOutputStream fos = new FileOutputStream("D:/book.qidian/" + "#" + bookName2 + "/" + "" + section + "" + title + ".txt");
pw = new PrintWriter(fos, true);
pw.println(content);
System.out.println(title + " " + "爬取完毕");
} catch (FileNotFoundException e) {
e.printStackTrace();
return;
} finally {//关流
pw.close();
}
}
private static void conclusion(String bookName2, List<String> titles, List<Integer> words_number) {
File file = new File("D:/book.qidian/" + "#" + bookName2);
PrintWriter pw = null;
try {
FileOutputStream fos = new FileOutputStream("D:/book.qidian/" + "#" + bookName2 + "/" + "conclusion" + ".txt");
pw = new PrintWriter(fos, true);
for (int i = 0; i < titles.size(); i++) {
pw.println(titles.get(i));
pw.println(words_number.get(i));
}
} catch (FileNotFoundException e) {
e.printStackTrace();
return;
} finally {//关流
pw.close();
}
}
}

二進制
target/classes/Main.class Normal file

未顯示二進位檔案。

未顯示二進位檔案。

未顯示二進位檔案。