Crawler training project: get the movie with the highest score of Douban and download it

Previous review

In the last blog post, we learned about four major Python crawler libraries — urllib, requests, Beautiful Soup and Selenium — in the introduction to common crawler libraries.

  • Learned the common usage of urllib and requests
  • I learned how to use beautiful soup to parse web pages and selenium to drive browsers

# Import the web driver module
from selenium import webdriver
# find_element_by_css_selector() was removed in Selenium 4;
# locate elements through the By strategy object instead.
from selenium.webdriver.common.by import By

# Create a Chrome driver instance
driver = webdriver.Chrome()
# Open Baidu with the get method
driver.get("https://www.baidu.com")
# Locate the search box and type in the query
# (renamed from `input`, which shadowed the builtin)
search_input = driver.find_element(By.CSS_SELECTOR, '#kw')
search_input.send_keys("Bordeno's photos")
# Locate the search button and click it
search_button = driver.find_element(By.CSS_SELECTOR, '#su')
search_button.click()

This is the code from last time, which searched for pictures of Mr. Bordeno. The effect is as follows

Capture Douban films and save them locally

Let's grab the top 250 movies on Douban


import requests
from bs4 import BeautifulSoup
import xlwt
# Jiaqun: 45692667, get more learning materials, hand training programs and learning atmosphere

def request_douban(url):
    """Fetch *url* and return the page body as text.

    Returns None when the request raises a requests.RequestException
    or when the server answers with a non-200 status (Douban rejects
    requests without a browser-like User-Agent, answering 418).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
    except requests.RequestException:
        return None
    # Non-200 response: make the implicit fall-through explicit.
    return None


# Workbook that will hold the scraped data; xlwt writes legacy binary
# .xls files, style_compression=0 keeps cell styles uncompressed.
book = xlwt.Workbook(encoding='utf-8', style_compression=0)

# One sheet with a header row; cell_overwrite_ok allows rewriting cells.
# NOTE(review): 'Watercress' looks like a machine mistranslation of
# 豆瓣 (Douban) — confirm the intended sheet name.
sheet = book.add_sheet('Watercress movie Top250', cell_overwrite_ok=True)
sheet.write(0, 0, 'Name')
sheet.write(0, 1, 'picture')
sheet.write(0, 2, 'ranking')
sheet.write(0, 3, 'score')
sheet.write(0, 4, 'author')
sheet.write(0, 5, 'brief introduction')

# Next free row index in the sheet (row 0 holds the header).
n = 1


def save_to_excel(soup):
    """Extract every movie entry from one parsed Top-250 page and append
    the fields to the module-level worksheet `sheet`.

    soup -- BeautifulSoup document of a Douban Top-250 list page.
    """
    global n

    # Renamed from `list`, which shadowed the builtin.
    items = soup.find(class_='grid_view').find_all('li')

    for item in items:
        item_name = item.find(class_='title').string
        item_img = item.find('a').find('img').get('src')
        # The rank number sits in an element whose class attribute is
        # literally empty on the Douban page, hence class_=''.
        item_index = item.find(class_='').string
        item_score = item.find(class_='rating_num').string
        item_author = item.find('p').text
        # BUG FIX: reset the blurb on every iteration. The original only
        # assigned item_intr inside the `if`, so a movie without an 'inq'
        # element reused the previous movie's blurb (or raised
        # UnboundLocalError if the very first movie lacked one).
        item_intr = ''
        if item.find(class_='inq') is not None:
            item_intr = item.find(class_='inq').string

        print('Crawling movie:' + item_index + ' | ' + item_name + ' | ' + item_score + ' | ' + item_intr)

        sheet.write(n, 0, item_name)
        sheet.write(n, 1, item_img)
        sheet.write(n, 2, item_index)
        sheet.write(n, 3, item_score)
        sheet.write(n, 4, item_author)
        sheet.write(n, 5, item_intr)

        n = n + 1


def main(page):
    """Fetch and store one Top-250 page; pages are 25 movies apart."""
    start = page * 25
    url = 'https://movie.douban.com/top250?start={}&filter='.format(start)
    page_html = request_douban(url)
    save_to_excel(BeautifulSoup(page_html, 'lxml'))


if __name__ == '__main__':
    # Ten pages of 25 movies each cover the full Top 250.
    for i in range(0, 10):
        main(i)

    # BUG FIX: the original line was a SyntaxError (unescaped apostrophe
    # inside a single-quoted string) and used a .csv extension even
    # though xlwt writes binary .xls workbooks. Saving is also moved
    # under the __main__ guard so importing the module has no side effect.
    book.save("Douban's most popular 250 films.xls")

code analysis

Import related libraries first

import requests
# Request Web Library
from bs4 import BeautifulSoup
# Parse web page Library
import xlwt
# Interact with Excel file

Define a function to request a web page

def request_douban(url):
    """Fetch *url* and return the page body as text.

    Returns None when the request raises a requests.RequestException
    or when the server answers with a non-200 status (Douban rejects
    requests without a browser-like User-Agent, answering 418).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
    except requests.RequestException:
        return None
    # Non-200 response: make the implicit fall-through explicit.
    return None

Create an Excel to store data

# Workbook that will hold the scraped data; xlwt writes legacy binary
# .xls files, style_compression=0 keeps cell styles uncompressed.
book = xlwt.Workbook(encoding='utf-8', style_compression=0)

# One sheet with a header row; cell_overwrite_ok allows rewriting cells.
# NOTE(review): 'Watercress' looks like a machine mistranslation of
# 豆瓣 (Douban) — confirm the intended sheet name.
sheet = book.add_sheet('Watercress movie Top250', cell_overwrite_ok=True)
sheet.write(0, 0, 'Name')
sheet.write(0, 1, 'picture')
sheet.write(0, 2, 'ranking')
sheet.write(0, 3, 'score')
sheet.write(0, 4, 'author')
sheet.write(0, 5, 'brief introduction')

# Next free row index in the sheet (row 0 holds the header).
n = 1

Define a function to save the data parsed by BeautifulSoup into Excel

def save_to_excel(soup):
    """Extract every movie entry from one parsed Top-250 page and append
    the fields to the module-level worksheet `sheet`.

    soup -- BeautifulSoup document of a Douban Top-250 list page.
    """
    global n

    # Renamed from `list`, which shadowed the builtin.
    items = soup.find(class_='grid_view').find_all('li')

    for item in items:
        item_name = item.find(class_='title').string
        item_img = item.find('a').find('img').get('src')
        # The rank number sits in an element whose class attribute is
        # literally empty on the Douban page, hence class_=''.
        item_index = item.find(class_='').string
        item_score = item.find(class_='rating_num').string
        item_author = item.find('p').text
        # BUG FIX: reset the blurb on every iteration. The original only
        # assigned item_intr inside the `if`, so a movie without an 'inq'
        # element reused the previous movie's blurb (or raised
        # UnboundLocalError if the very first movie lacked one).
        item_intr = ''
        if item.find(class_='inq') is not None:
            item_intr = item.find(class_='inq').string

        print('Crawling movie:' + item_index + ' | ' + item_name + ' | ' + item_score + ' | ' + item_intr)

        sheet.write(n, 0, item_name)
        sheet.write(n, 1, item_img)
        sheet.write(n, 2, item_index)
        sheet.write(n, 3, item_score)
        sheet.write(n, 4, item_author)
        sheet.write(n, 5, item_intr)

        n = n + 1

Define the main function, which builds the URL for each page, fetches it and stores the results; then call it for every page.

def main(page):
    """Fetch and store one Top-250 page; pages are 25 movies apart."""
    start = page * 25
    url = 'https://movie.douban.com/top250?start={}&filter='.format(start)
    page_html = request_douban(url)
    save_to_excel(BeautifulSoup(page_html, 'lxml'))


if __name__ == '__main__':
    # Ten pages of 25 movies each cover the full Top 250.
    for page_no in range(10):
        main(page_no)

After running, I found that the output workbook "Douban's most popular 250 films" was added to the folder. Open it and have a look

Tags: Programming Selenium Excel encoding Python

Posted on Wed, 04 Dec 2019 13:01:43 -0800 by lou28