提交信息
This commit is contained in:
父節點
5f0130b02d
當前提交
ce320f0036
1
Book
Submodule
1
Book
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit 5f0130b02d8d9a26532acd6c8dd79b43a957740a
|
42
pom.xml
Normal file
42
pom.xml
Normal file
@ -0,0 +1,42 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<groupId>org.example</groupId>
|
||||
<artifactId>FindBook</artifactId>
|
||||
<version>1.0-SNAPSHOT</version>
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<configuration>
|
||||
<source>8</source>
|
||||
<target>8</target>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-core</artifactId>
|
||||
<version>0.7.3</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>us.codecraft</groupId>
|
||||
<artifactId>webmagic-extension</artifactId>
|
||||
<version>0.7.3</version>
|
||||
</dependency>
|
||||
<!-- https://mvnrepository.com/artifact/com.squareup.okhttp3/okhttp -->
|
||||
<dependency>
|
||||
<groupId>com.squareup.okhttp3</groupId>
|
||||
<artifactId>okhttp</artifactId>
|
||||
<version>4.9.1</version>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
</project>
|
9
src/main/java/Main.java
Normal file
9
src/main/java/Main.java
Normal file
@ -0,0 +1,9 @@
|
||||
import Service.JsoupUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class Main {
|
||||
public static void main(String[] args) throws IOException {
|
||||
new JsoupUtils().getBook(1,3,"https://book.qidian.com/info/1029575290/#Catalog");
|
||||
}
|
||||
}
|
165
src/main/java/Service/HTMLParseUtils.java
Normal file
165
src/main/java/Service/HTMLParseUtils.java
Normal file
@ -0,0 +1,165 @@
|
||||
package Service;
|
||||
|
||||
import us.codecraft.webmagic.Page;
|
||||
import us.codecraft.webmagic.Site;
|
||||
import us.codecraft.webmagic.processor.PageProcessor;
|
||||
import us.codecraft.webmagic.selector.Html;
|
||||
import us.codecraft.webmagic.selector.Selectable;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.PrintWriter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class HTMLParseUtils implements PageProcessor {
|
||||
|
||||
private Site site = Site.me()
|
||||
|
||||
.setCharset("utf-8")
|
||||
|
||||
.setTimeOut(1000)
|
||||
|
||||
.setSleepTime(1000);
|
||||
|
||||
String bookName1 = "";
|
||||
|
||||
|
||||
@Override
|
||||
|
||||
public Site getSite() {return site;}
|
||||
|
||||
//爬取数据逻辑
|
||||
|
||||
//第一级URL https://www.qidian.com/xuanhuan 获取书栏目录
|
||||
|
||||
//第二级 https://book.qidian.com/info/1019251979#Catalog 章节目录
|
||||
|
||||
//第三级 https://read.qidian.com/chapter/SaT8jsiJD54smgY_yC2imA2/oQbX6YtwB_NOBDFlr9quQA2 章节内容
|
||||
|
||||
@Override
|
||||
|
||||
public void process(Page page) {
|
||||
|
||||
//获取URL
|
||||
|
||||
Selectable table = page.getUrl();
|
||||
|
||||
//URL匹配 用.{23}去代替字符匹配,每个章节的后缀不一样
|
||||
|
||||
if (table.regex("https://read.qidian.com/chapter/.{23}/.{23}").match()) {//文章章节页面
|
||||
|
||||
Html html = page.getHtml();
|
||||
String title = "";
|
||||
List<String> content = new ArrayList<String>();
|
||||
//判断是否是第一章
|
||||
|
||||
if (html.xpath("/html/body/div[2]/div[3]/div[2]/div[1]/div[1]/div[1]/div[1]/h1/text()").toString() != null) {//是第一章
|
||||
|
||||
//获取书名
|
||||
|
||||
bookName1 = html.xpath("/html/body/div[2]/div[3]/div[2]/div[1]/div[1]/div[1]/div[1]/h1/text()").toString();
|
||||
|
||||
//System.out.println(bookName);
|
||||
|
||||
//获取章节名
|
||||
|
||||
title = html.xpath("[@class='main-text-wrap']/div[1]/h3/span/text()").toString();
|
||||
|
||||
//System.out.println(title);
|
||||
|
||||
//获取文章内容
|
||||
|
||||
content = html.xpath("[@class='main-text-wrap']/div[2]/p/text()").all();
|
||||
|
||||
} else {//不是第一章
|
||||
|
||||
//获取章节名
|
||||
|
||||
title = html.xpath("[@id='j_chapterBox']/div[1]/div[1]/div[1]/h3/span/text()").toString();
|
||||
|
||||
//获取文章内容
|
||||
|
||||
content = html.xpath("[@id='j_chapterBox']/div[1]/div[1]/div[2]/p/text()").all();
|
||||
|
||||
}
|
||||
|
||||
//存到本地
|
||||
|
||||
downBook(bookName1, title, content);
|
||||
|
||||
}else if(table.regex("https://book.qidian.com/info/\\d{10}#Catalog").match()){//书的章节目录
|
||||
|
||||
//获取每一章节的地址,在章节目录里每一章的xpath
|
||||
|
||||
List<String> url = page.getHtml().xpath("[@class='volume-wrap']/div[1]/ul/li/a/@href").all();
|
||||
|
||||
//加入待爬取序列
|
||||
|
||||
page.addTargetRequests(url);
|
||||
|
||||
}else{//一级url
|
||||
|
||||
Html html = page.getHtml();
|
||||
|
||||
List<String> url = html.xpath("[@id='new-book-list']/div/ul/li/div[2]/h4/a/@href").all();
|
||||
|
||||
List<String> url2 = new ArrayList<String>();
|
||||
|
||||
for (String string : url) {
|
||||
|
||||
url2.add(string + "#Catalog");
|
||||
|
||||
}
|
||||
|
||||
//加入待爬取序列
|
||||
|
||||
page.addTargetRequests(url2);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//将书存入本地
|
||||
|
||||
private void downBook(String bookName2, String title, List<String> content) {
|
||||
|
||||
//判断目录存不存在
|
||||
|
||||
File file = new File("D:/book.qidian/" + bookName2);
|
||||
|
||||
if(!file.exists()){
|
||||
|
||||
file.mkdirs();
|
||||
|
||||
}
|
||||
|
||||
PrintWriter pw = null; //使用IO流
|
||||
|
||||
try {
|
||||
|
||||
FileOutputStream fos = new FileOutputStream("D:/book.qidian/" + bookName2 + "/" + title + ".txt");
|
||||
|
||||
pw = new PrintWriter(fos,true);
|
||||
|
||||
for (String string : content) {
|
||||
|
||||
pw.println(string);
|
||||
|
||||
}
|
||||
|
||||
System.out.println(title + " " + "爬取完毕");
|
||||
|
||||
} catch (FileNotFoundException e) {
|
||||
|
||||
e.printStackTrace();
|
||||
|
||||
} finally {//关流
|
||||
|
||||
pw.close();
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
151
src/main/java/Service/JsoupUtils.java
Normal file
151
src/main/java/Service/JsoupUtils.java
Normal file
@ -0,0 +1,151 @@
|
||||
package Service;
|
||||
|
||||
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.select.Elements;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class JsoupUtils {
|
||||
public void getBook(String url, int section) throws IOException {
|
||||
if (Jsoup.connect(url).execute().statusCode() == 200) {
|
||||
Document doc = Jsoup.connect(url).get();
|
||||
String bookName = doc.title();
|
||||
Elements links = doc.select("#j-catalogWrap > div.volume-wrap");
|
||||
Elements contentUrl = links.select("a");
|
||||
List<String> urls = contentUrl.stream().map(element -> element.attr("abs:href")).collect(Collectors.toList());
|
||||
List<String> titles = new ArrayList<>();
|
||||
List<Integer> words_number = new ArrayList<>();
|
||||
for (String a : urls) {
|
||||
if (a.contains("vipreader")) {
|
||||
System.out.println("剩下的是vip章节了");
|
||||
break;
|
||||
} else if (a.contains("chapter")) {
|
||||
Document mo = Jsoup.connect(a).get();
|
||||
String id = mo.getElementsByClass("text-wrap").attr("id");
|
||||
System.out.println(id);
|
||||
Elements title = mo.select("#" + id + " > div > div.text-head > h3 > span.content-wrap");
|
||||
Elements content = mo.select("#" + id + " > div > div.read-content.j_readContent");
|
||||
String tit = title.text();
|
||||
String con = content.text();
|
||||
if (!tit.equals("")) {
|
||||
titles.add(tit);
|
||||
words_number.add(con.length());
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
downBook(bookName, tit, con, section);
|
||||
|
||||
} else {
|
||||
section++;
|
||||
}
|
||||
|
||||
}
|
||||
if (!titles.isEmpty()) {
|
||||
conclusion(bookName,titles,words_number);
|
||||
}
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//获得从x卷到y卷
|
||||
public void getBook(int start, int end, String url) throws IOException {
|
||||
Document doc = Jsoup.connect(url).get();
|
||||
String bookName = doc.title();
|
||||
Elements links = doc.select("#j-catalogWrap > div.volume-wrap");
|
||||
Elements contentUrl = links.select("a");
|
||||
List<String> urls = contentUrl.stream().map(element -> element.attr("abs:href")).collect(Collectors.toList());
|
||||
List<String> titles = new ArrayList<>();
|
||||
List<Integer> words_number = new ArrayList<>();
|
||||
for (String a : urls) {
|
||||
if (a.contains("chapter")) {
|
||||
Document mo = Jsoup.connect(a).get();
|
||||
String id = mo.getElementsByClass("text-wrap").attr("id");
|
||||
System.out.println(id);
|
||||
Elements title = mo.select("#" + id + " > div > div.text-head > h3 > span.content-wrap");
|
||||
Elements content = mo.select("#" + id + " > div > div.read-content.j_readContent");
|
||||
String tit = title.text();
|
||||
String con = content.text();
|
||||
if (!tit.equals("")) {
|
||||
titles.add(tit);
|
||||
words_number.add(con.length());
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
|
||||
downBook(bookName, tit, con, start - 1);
|
||||
|
||||
} else {
|
||||
start++;
|
||||
if (start == end) {
|
||||
conclusion(bookName,titles,words_number);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
private static void downBook(String bookName2, String title, String content, int section) {
|
||||
|
||||
File file = new File("D:/book.qidian/" + "#" + bookName2);
|
||||
|
||||
if (!file.exists()) {
|
||||
|
||||
//如果不存在目录,则创建目录
|
||||
file.mkdirs();
|
||||
}
|
||||
|
||||
PrintWriter pw = null;
|
||||
try {
|
||||
|
||||
FileOutputStream fos = new FileOutputStream("D:/book.qidian/" + "#" + bookName2 + "/" + "第" + section + "卷" + title + ".txt");
|
||||
|
||||
pw = new PrintWriter(fos, true);
|
||||
|
||||
pw.println(content);
|
||||
|
||||
System.out.println(title + " " + "爬取完毕");
|
||||
|
||||
} catch (FileNotFoundException e) {
|
||||
e.printStackTrace();
|
||||
return;
|
||||
|
||||
} finally {//关流
|
||||
pw.close();
|
||||
}
|
||||
}
|
||||
|
||||
private static void conclusion(String bookName2, List<String> titles, List<Integer> words_number) {
|
||||
|
||||
File file = new File("D:/book.qidian/" + "#" + bookName2);
|
||||
|
||||
PrintWriter pw = null;
|
||||
try {
|
||||
|
||||
FileOutputStream fos = new FileOutputStream("D:/book.qidian/" + "#" + bookName2 + "/" + "conclusion" + ".txt");
|
||||
|
||||
pw = new PrintWriter(fos, true);
|
||||
|
||||
for (int i = 0; i < titles.size(); i++) {
|
||||
pw.println(titles.get(i));
|
||||
pw.println(words_number.get(i));
|
||||
}
|
||||
|
||||
|
||||
} catch (FileNotFoundException e) {
|
||||
e.printStackTrace();
|
||||
return;
|
||||
|
||||
} finally {//关流
|
||||
pw.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
二進制
target/classes/Main.class
Normal file
二進制
target/classes/Main.class
Normal file
未顯示二進位檔案。
二進制
target/classes/Service/HTMLParseUtils.class
Normal file
二進制
target/classes/Service/HTMLParseUtils.class
Normal file
未顯示二進位檔案。
二進制
target/classes/Service/JsoupUtils.class
Normal file
二進制
target/classes/Service/JsoupUtils.class
Normal file
未顯示二進位檔案。
載入中…
x
新增問題並參考
Block a user