Using Python to develop novel crawler reader based on tkinter

Program name
  • Novel crawling reader
Crawling website
Technology used
  • This program is the extension of GUI visual interface learning of regular, file and python
Program function description
  • This program is developed for me and my professional collaborators to download and display novels without local files. This program can crawl nearly 400000 novels in real time without any local files (as long as the novels exist in the source website) and save each novel in the folder of each novel by chapters in the form of txt file.
Screenshot of program running results

Crawling novel name and web page, creating info file for storage

If the novel has already been downloaded, there is no need to crawl it again; otherwise a path is created and the novel is stored, chapter by chapter, in a folder named after its title. Reading novels:

If the file does not exist, access and crawl the chapter and content through the address in the list

Part of program code
import ast
import os
import re
import urllib.request
from tkinter import *
from tkinter import ttk

import requests
bt=0
tt=0
def mkdir(path):
    """Create directory *path* if it does not exist yet.

    Side effect: records the outcome in the module-level flag ``bt``
    (1 = directory was created, -1 = it already existed).

    Returns True when the directory was created, False otherwise.
    """
    global bt
    # Normalise the incoming name: trim surrounding whitespace and a
    # trailing backslash so os.makedirs receives a clean directory path.
    cleaned = path.strip().rstrip("\\")

    # Guard clause: an existing directory is left untouched.
    if os.path.exists(cleaned):
        print(cleaned + ' directory already exists')
        bt = -1
        return False

    # Build the full directory chain in one call.
    os.makedirs(cleaned)
    print(cleaned + ' Create success')
    bt = 1
    return True


def get_web():
    """Crawl the source site's ranking pages and build ``info.txt``.

    Fetches pages 1..49 of http://www.bjkgjlu.com/top/ and writes one
    "<catalog-url><title>" line per novel. The file is truncated on the
    first page and appended to afterwards, so each novel appears exactly
    once.

    Bug fixed: the original wrote page 1 twice — once via the
    ``if i == 1`` block opened with mode 'w' and again via the
    unconditional 'a+' pass that followed it.
    """
    for page_no in range(1, 50):
        url = "http://www.bjkgjlu.com/top/p%d" % page_no
        html = requests.get(url).text
        # The novel list sits between the striped table and the pager div.
        web_info = re.findall(r'<table class="table table-striped">(.*?)<div class="page">', html, re.S)[0]
        info = re.findall(r'<a href="(.*?)">(.*?)</a>', web_info, re.S)
        # 'w' truncates any stale file on the first page; later pages append.
        mode = 'w' if page_no == 1 else 'a'
        with open('info.txt', mode, encoding='utf-8') as fb:
            for chapter_url, chapter_title in info:
                chapter_url = "http://www.bjkgjlu.com%scatalog/" % chapter_url
                fb.write(chapter_url)
                fb.write(chapter_title)
                fb.write("\n")
                if page_no == 1:
                    print(chapter_title)
                    print(chapter_url, end=" ")

# Chapter index of the most recently downloaded novel; the GUI reads it
# to populate the chapter combobox.
chapter_title_name = []


def web(st):
    """Download the novel titled *st* from the source website.

    Looks the title up in ``info.txt`` (built by get_web), creates a
    folder named after the novel, saves each chapter as its own .txt file
    and writes a "Chapter list.txt" index into the same folder.

    Status is communicated through module globals:
      tt -- 0 when the title was found, -1 when it was not;
      bt -- set by mkdir(): 1 means the folder was freshly created (so
            chapters were downloaded), -1 means it already existed and
            nothing is re-fetched.

    Fixes over the original: tt is reset on entry (a failed lookup used
    to block all later calls); the chapter index is cleared per novel and
    written once after the loop instead of on every iteration; markup is
    stripped with a regex instead of chained .replace() calls that
    deleted every literal letter 'p' from the text.
    """
    global bt
    global tt
    global chapter_title_name
    tt = 0  # reset: a previous failed lookup must not poison this call

    with open('info.txt', 'r', encoding="utf-8") as f:
        lines = f.readlines()

    # Each info.txt line is "<ascii catalog url><chinese title>"; pull the
    # two halves apart with one character-class regex each.
    read_chinese = re.compile(r'[\u4e00-\u9f5a]+')
    read_web = re.compile(r'[a-zA-Z:\.\/\d]+')
    titles = [read_chinese.findall(x)[0] for x in lines]
    urls = [read_web.findall(x)[0] for x in lines]

    if st not in titles:
        print("No corresponding novel found")
        tt = -1
        return
    name = st
    url = urls[titles.index(st)]

    # Fetch the catalog page for the matched novel.
    page = urllib.request.Request(url)
    html = urllib.request.urlopen(page).read().decode("utf-8")
    title = re.findall(r'<h2>(.*?)</h2>', html)[0]
    print(title)

    mkdir(name)
    if bt == 1:
        # Fresh folder: fetch every chapter listed on the catalog page.
        chapter_title_name = []  # start a clean index for this novel
        div = re.findall(r'<div class="list">(.*?)<div class="bottom">', html, re.S)[0]
        chapter_info_list = re.findall(r'href="(.*?)">(.*?)<', div)
        for chapter_url, chapter_title in chapter_info_list:
            chapter_url = "http://www.bjkgjlu.com%s" % chapter_url
            chapter_page = urllib.request.Request(chapter_url)
            chapter_html = urllib.request.urlopen(chapter_page).read().decode('utf-8')
            chapter_content = re.findall(
                r'<div class="chapter_content">(.*?)<div class="col-md-45">',
                chapter_html, re.S)[0]
            # Strip every HTML tag in one pass, then drop padding spaces.
            chapter_content = re.sub(r'<[^>]*>', '', chapter_content)
            chapter_content = chapter_content.replace(' ', '')
            chapter_title_name.append(chapter_title)
            file_name = "./" + name + "/" + "%s" % chapter_title + ".txt"
            print(file_name)
            with open(file_name, "w", encoding="utf-8") as fb:
                fb.write(chapter_title)
                fb.write('\n')
                fb.write(chapter_content)
                fb.write('\n')
            print(chapter_url)
        # Write the chapter index once, after all chapters are saved.
        zjname = "./" + name + "/" + "Chapter list.txt"
        with open(zjname, "w", encoding="utf-8") as file:
            file.write(str(chapter_title_name))
        print('Download complete')
    elif bt == -1:
        # Folder already exists: chapters were saved by an earlier run;
        # the GUI will read the existing "Chapter list.txt".
        pass

class ReadApp(object):
    """Tkinter GUI: enter a novel title, download it via web(), then pick
    chapters from a combobox and read them in the text pane."""

    def __init__(self):
        self.tk = Tk()
        self.tk.geometry('740x810')
        self.tk.title("Be based on tkinter Novel reader of v2.0")
        self.creat_interface()
        # Single event loop for the whole app (the duplicate mainloop()
        # call that used to sit at the end of creat_interface is gone).
        self.tk.mainloop()

    def go(self, *args):
        """Load the chapter selected in the combobox into the text widget."""
        chapter = self.comboxlist.get()
        novel = self.entry.get()
        # Chapters are saved by web() as ./<novel>/<chapter>.txt; the old
        # code built "<novel>./<chapter>.txt", which only worked by
        # accident on Windows (trailing dot stripped from directory names).
        path = "./" + novel + "/" + chapter + ".txt"
        with open(path, "r", encoding='utf-8') as file:
            content = file.read()
        self.text.delete('1.0', 'end')
        self.text.insert(INSERT, content)

    def creat_interface(self):
        """Build and place all widgets: labels, entry, combobox, text, button."""
        self.label = Label(self.tk, text="Title:", fg='blue', font=('Microsoft YaHei', 25))
        self.label.place(x=30, y=20)
        self.labe2 = Label(self.tk, text="chapter:", fg='red', font=('Microsoft YaHei', 25))
        self.labe2.place(x=30, y=100)
        self.entry = Entry(self.tk, fg='black', font=('Microsoft YaHei', 20))
        self.entry.place(x=180, y=30, width=350)
        list1 = StringVar()
        self.comboxlist = ttk.Combobox(self.tk, textvariable=list1)
        # Placeholder values until a novel's chapter list is loaded by g().
        self.comboxlist["values"] = (1, 4, 2, 5, 4)
        # Selecting a chapter triggers go().
        self.comboxlist.bind("<<ComboboxSelected>>", self.go)
        self.comboxlist.place(x=180, y=105, height=50, width=380)
        self.text = Text(self.tk, width=75, height=36)
        self.text.place(x=30, y=170)
        self.b = Button(self.tk, text='confirm', font=('KaiTi', 23), bg='pink',
                        fg='green', bd=2, width=6, command=self.g)
        self.b.place(x=550, y=20)

    def g(self):
        """Confirm button: download the entered novel, fill the chapter box."""
        global tt
        novel = self.entry.get()
        web(novel)
        if tt == 0:
            global chapter_title_name
            self.comboxlist["values"] = ()
            index_path = "./" + novel + "/" + "Chapter list.txt"
            with open(index_path, "r", encoding="utf-8") as file:
                content = file.read()
            # literal_eval instead of eval(): the index file is data, not
            # code, and must not be executed.
            chapter_title_name = ast.literal_eval(content)
            self.comboxlist["values"] = chapter_title_name

if __name__ == '__main__':
    # get_web()  # run once first to (re)build info.txt from the ranking pages
    readapp=ReadApp()  # constructing the app blocks here in Tk's mainloop

Description of some program codes
  • get_web is used to crawl all the novel names and catalog addresses from the source novel website and store them in a file. The program imports the requests library and fetches the source code of the novel ranking pages. findall is used to extract every novel's name and href address, and the results are written to the file page by page in a loop.

  • Web function crawls the chapters and contents of the target novel: through the previous step, we get a txt file that stores all the novel web addresses and names, and use regular processing to divide all the information in the file into names and web addresses, which are stored in three lists. Match the novel name entered by the user with all the names in the above list, and return the corresponding URL of the novel name if any. Thus, the chapters and all contents of the target novel can be crawled through the regular. The crawled novel content is cleaned and stored in the specified file in chapters.

  • The tkinter class's __init__ method creates the tkinter interface. The creat_interface method adds components such as text boxes, input boxes, and drop-down boxes, and adjusts their size and position; the go method loads the selected chapter into the text box.

  • G method is used to click the command executed after confirmation: get the input name, match, add chapter list in the drop-down box, and add novel content in the text box.

Tags: Programming encoding Python Lambda

Posted on Wed, 11 Mar 2020 00:58:01 -0700 by Candrew