Cool
Cool
Published on 2021-09-22 / 19 Visits
0
0

小说爬取 代码片段

main

package com.liang;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.junit.Test;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;

/**
 * Scrapes the chapters of a web novel and appends them to a text file.
 *
 * <p>Each chapter page is loaded with HtmlUnit so that the page's JavaScript runs, the rendered
 * markup is parsed with Jsoup, and the chapter title and body are handed to {@link NovelWrite}.
 */
public class GetJSDocument1 {

    /** URL prefix of a chapter page; the numeric chapter id and {@code .html} are appended. */
    private static final String CHAPTER_URL_PREFIX = "http://www.xiaonongminxs.com/chongfantian/";

    /** Id of the first chapter page to fetch (inclusive). */
    private static final int FIRST_CHAPTER_ID = 7666;

    /** Id at which fetching stops (exclusive). */
    private static final int END_CHAPTER_ID = 9132;

    /** How many times a single chapter is attempted before it is skipped. */
    private static final int MAX_ATTEMPTS = 3;

    /** Sink that appends each extracted chapter to the output file. */
    private final NovelWrite novelWrite = new NovelWrite();

    /**
     * Command-line entry point: runs the scrape and prints any failure that escapes it.
     *
     * @param args unused
     * @throws IOException          declared for compatibility; failures are caught and printed
     * @throws InterruptedException declared for compatibility; failures are caught and printed
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        GetJSDocument1 getJSDocument = new GetJSDocument1();

        try {
            getJSDocument.getDoc();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the JVM/caller can still observe the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Fetches every chapter from {@code FIRST_CHAPTER_ID} (inclusive) to {@code END_CHAPTER_ID}
     * (exclusive) and writes each one out.
     *
     * <p>A chapter that keeps failing is skipped after {@code MAX_ATTEMPTS} tries instead of being
     * retried forever, and the browser is always closed when the run ends.
     *
     * @throws InterruptedException if the thread is interrupted while waiting on a page
     * @throws IOException          declared for compatibility with existing callers
     */
    @Test
    public void getDoc() throws InterruptedException, IOException {
        // Silence HtmlUnit and Apache HttpClient logging.
        java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
        java.util.logging.Logger.getLogger("org.apache.http.client").setLevel(Level.OFF);

        // HtmlUnit simulates a browser; try-with-resources closes it even when a fetch fails.
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            webClient.getOptions().setJavaScriptEnabled(true); // run page scripts
            webClient.getOptions().setCssEnabled(false); // CSS is not needed for text extraction
            webClient.getOptions().setThrowExceptionOnScriptError(false); // tolerate broken scripts
            webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
            // Resynchronise AJAX calls so their results are present in the page.
            webClient.setAjaxController(new NicelyResynchronizingAjaxController());
            webClient.getOptions().setTimeout(10 * 1000); // timeout in milliseconds

            for (int chapterId = FIRST_CHAPTER_ID; chapterId < END_CHAPTER_ID; chapterId++) {
                String url = CHAPTER_URL_PREFIX + chapterId + ".html";
                System.out.println(url);
                fetchWithRetry(url, webClient);
            }
        }
    }

    /**
     * Tries to extract one chapter, retrying a bounded number of times on failure.
     *
     * @param url       chapter page URL
     * @param webClient browser used to load the page
     * @throws InterruptedException if the thread is interrupted; never retried
     */
    private void fetchWithRetry(String url, WebClient webClient) throws InterruptedException {
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            try {
                extracted(url, webClient);
                return;
            } catch (IOException | RuntimeException e) {
                System.err.println("Attempt " + attempt + " of " + MAX_ATTEMPTS + " failed for " + url);
                e.printStackTrace();
            }
        }
        System.err.println("Giving up on " + url + " after " + MAX_ATTEMPTS + " attempts");
    }

    /**
     * Loads one chapter page, extracts its title and body, prints them and appends them to the
     * output file.
     *
     * @param url       chapter page URL
     * @param webClient browser used to load the page
     * @throws IOException          if the page cannot be loaded
     * @throws InterruptedException if the thread is interrupted during the pause
     */
    private void extracted(String url, WebClient webClient) throws IOException, InterruptedException {
        HtmlPage htmlPage = webClient.getPage(url);
        Thread.sleep(1000);
        // Give background JavaScript up to 3 seconds to finish.
        webClient.waitForBackgroundJavaScript(3 * 1000);

        // Re-parse the rendered page with Jsoup for CSS-selector access.
        Document document = Jsoup.parse(htmlPage.asXml());
        String title = document.select(".m-title").select("h1").text();
        String contentHtml = document.select(".panel-body").html();

        StringBuilder chapter = new StringBuilder();
        chapter.append("\r\n");
        chapter.append(title);
        chapter.append("\r\n");
        chapter.append(cleanContent(contentHtml));
        System.out.println(chapter);

        novelWrite.writenovel(chapter);
    }

    /**
     * Turns the inner HTML of the content panel into plain chapter text.
     *
     * <p>{@code <br>} becomes a CRLF line break, {@code &nbsp;} is dropped, and everything up to
     * and including the last closing {@code div} tag is discarded. If there is no closing
     * {@code div} the text is returned whole rather than failing on a negative index.
     * NOTE(review): presumably the divs before the text are ads or navigation; confirm against
     * the live page.
     *
     * @param html inner HTML of the {@code .panel-body} element
     * @return the chapter body as plain text
     */
    private static String cleanContent(String html) {
        String text = html.replace("<br>", "\r\n").replace("&nbsp;", "");

        int closeTag = text.lastIndexOf("</div");
        if (closeTag < 0) {
            return text;
        }
        // Skip past the tag's '>' as well, so no stray '>' is left at the start of the chapter.
        int tagEnd = text.indexOf('>', closeTag);
        if (tagEnd < 0) {
            return text.substring(closeTag + "</div".length());
        }
        return text.substring(tagEnd + 1);
    }
}

write

package com.liang;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;

/**
 * Appends scraped novel text to a text file.
 *
 * <p>Text is written as UTF-8 regardless of the platform default charset.
 * NOTE(review): if the target file already holds text written in another charset, appending
 * UTF-8 will mix encodings; start from a fresh file in that case.
 */
public class NovelWrite {

    /** Output file used when no explicit path is supplied. */
    private static final Path DEFAULT_OUTPUT = Paths.get("C:\\Users\\Fangliang\\Desktop\\a\\123.txt");

    /** File that every call to {@link #writenovel(StringBuilder)} appends to. */
    private final Path outputFile;

    /** Creates a writer that appends to the default output file. */
    public NovelWrite() {
        this(DEFAULT_OUTPUT);
    }

    /**
     * Creates a writer that appends to the given file.
     *
     * @param outputFile file to append to; created on first write if it does not exist
     * @throws IllegalArgumentException if {@code outputFile} is {@code null}
     */
    public NovelWrite(Path outputFile) {
        if (outputFile == null) {
            throw new IllegalArgumentException("outputFile must not be null");
        }
        this.outputFile = outputFile;
    }

    /**
     * Appends the given text to the end of the output file.
     *
     * <p>I/O failures are reported on standard error and not propagated, so one failed write
     * does not abort the caller. A {@code null} argument is ignored.
     *
     * @param stringBuilder text to append; may be {@code null}
     */
    public void writenovel(StringBuilder stringBuilder) {
        if (stringBuilder == null) {
            return;
        }
        // try-with-resources closes (and flushes) the writer even when the write fails.
        try (BufferedWriter writer = Files.newBufferedWriter(
                outputFile,
                StandardCharsets.UTF_8,
                StandardOpenOption.CREATE,
                StandardOpenOption.APPEND)) {
            writer.append(stringBuilder);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}


Comment