豆瓣电影 Top 250
1.BeautifulSoup初始化
soup = BeautifulSoup(html, 'html.parser')
2.find和find_all
查找某个元素,返回一个结果
soup.find("标签名", attrs={"属性": "值"})
查找某个元素,返回所有符合条件结果
result = soup.find_all("标签名", attrs={"属性": "值"})
取文本
result.text
取属性
result.get("属性名")
3.250电影排名实例
import requests
from bs4 import BeautifulSoup
import pprint
import json
import pandas
from my_fake_useragent import UserAgent
page_indexs = range(0, 250, 25)
def down_htmls():
    """Download the HTML of every Douban Top-250 list page.

    Iterates over the module-level ``page_indexs`` offsets and fetches
    each 25-film page with a randomized User-Agent header.

    Returns:
        list[str]: one raw HTML document per list page, in rank order.

    Raises:
        Exception: if any page responds with a non-200 status code.
    """
    ua = UserAgent()  # hoisted out of the loop: one UA pool serves all requests
    htmls = []
    for idx in page_indexs:
        url = f"https://movie.douban.com/top250?start={idx}&filter"
        print("craw html:", url)
        headers = {'User-Agent': ua.random()}
        # timeout so a stalled connection cannot hang the crawl forever
        r = requests.get(url, headers=headers, timeout=10)
        if r.status_code != 200:
            # include status and URL so a failure is actually diagnosable
            raise Exception(f"request failed with status {r.status_code}: {url}")
        htmls.append(r.text)
    return htmls
htmls = down_htmls()
def extract_single_html(html):
    """Parse one Top-250 list page into per-film records.

    Args:
        html: raw HTML text of a single list page (25 films).

    Returns:
        list[dict]: one dict per film with keys ``rank``, ``title``,
        ``rating_star``, ``rating_num``, ``comments_num``; all values
        are strings exactly as scraped.
    """
    soup = BeautifulSoup(html, 'html.parser')
    article_items = (
        soup.find("div", class_="article")
            .find("ol", class_="grid_view")
            .find_all("li")
    )
    datas = []
    for article_item in article_items:
        rank = article_item.find("em").get_text()
        title = article_item.find("span", class_="title").get_text()
        stars = article_item.find("div", class_="star")
        # query the star-block spans once and reuse, instead of re-searching
        spans = stars.find_all("span")
        # first span's CSS class looks like "rating5-t"; stripped to "5" below
        rating_star = spans[0]["class"][0]
        rating_num = stars.find("span", class_="rating_num").get_text()
        # fourth span carries the comment-count text
        comments_num = spans[3].get_text()
        datas.append(
            {
                "rank": rank,
                "title": title,
                "rating_star": rating_star.replace("rating", "").replace("-t", ""),
                "rating_num": rating_num,
                "comments_num": comments_num,
            }
        )
    return datas
# Run the pipeline: parse every downloaded page, flatten the per-page
# results into one list of 250 records, then persist them to Excel.
all_datas = []
for html in htmls:
    all_datas.extend(extract_single_html(html))
df = pandas.DataFrame(all_datas)
df.to_excel("250rankMovies.xlsx")