Scraping the Douban Top 250 movie list with BeautifulSoup and requests
In about 30 lines of code
The script walks the list's ten pages and, for every film, grabs the detail-page link, title, cast line, rating, and tagline, then writes everything to a CSV with pandas. The code is as follows:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# five parallel lists, one per field we collect
hrefs, titles, actors, ratings, quotes = [[] for i in range(5)]
result = [hrefs, titles, actors, ratings, quotes]
# a desktop browser User-Agent so Douban does not reject the request;
# it never changes, so build it once outside the loop
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
                         " AppleWebKit/537.36 (KHTML, like Gecko)"
                         " Chrome/108.0.0.0 Safari/537.36"}
# the Top 250 list shows 25 films per page: start=0, 25, ..., 225
for page in range(0, 250, 25):
    url = f"https://movie.douban.com/top250?start={page}&filter="
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "lxml")
    # every film's details sit inside a <div class="info"> block
    infos = soup.find_all("div", class_="info")
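    # the "lxml" parser above needs `pip install lxml`; the standard
    # library's "html.parser" is a drop-in alternative if lxml is
    # unavailable (a note I added, not part of the original script):
    # soup = BeautifulSoup(response.content, "html.parser")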
    print("*"*50)
    for index, info in enumerate(infos, start=1):
        print(index+page)
        href = info.find("a")["href"]
        # strip the newlines and spaces Douban pads the text with
        title = info.find("a").get_text().replace("\n", "").replace(" ", "")
        actor = info.find("p").get_text().replace("\n", "").replace(" ", "")
        rating = info.find("span", class_="rating_num").get_text()
        # not every film has a one-line tagline (the span.inq element)
        if info.find("span", class_="inq") is None:
            quote = "nothing"
        else:
            quote = info.find("span", class_="inq").get_text()
        one_result = [href, title, actor, rating, quote]
        for i in one_result:
            print(i)
        # append each field to its matching list
        for field, value in zip(result, one_result):
            field.append(value)
# assemble the five lists into a DataFrame and write it out
column_name = ["href", "title", "actor", "rating", "quote"]
df = pd.DataFrame(dict(zip(column_name, result)))
df.to_csv("douban.csv", index=False)  # index=False drops pandas' row numbers
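
One practical caveat: Douban throttles clients that hammer it, so the bare requests.get call can start returning 403s or empty pages partway through the run. A minimal hardening sketch; the fetch_page helper name, 10-second timeout, and 2-second pause are my own choices, not part of the original script:

import time

def fetch_page(url, headers, pause=2.0):
    """Fetch one listing page politely and return its raw bytes."""
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()  # fail loudly on a 403/5xx instead of parsing an error page
    time.sleep(pause)            # space out successive requests
    return response.content

With this helper, the request and parse lines inside the loop collapse to soup = BeautifulSoup(fetch_page(url, headers), "lxml").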
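
To sanity-check the result, read the CSV back and confirm all 250 films made it (a quick check of my own, not in the original):

check = pd.read_csv("douban.csv")
print(len(check))    # expect 250
print(check.head())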