01 | import re |
02 | from urllib.request import urlopen, Request |
03 |
def getPage(url):
    """Download *url* and return the response body decoded as UTF-8.

    A browser User-Agent header is sent because douban.com rejects
    requests carrying urllib's default agent string.
    """
    headers = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36' }
    req = Request(url, headers=headers)
    # Context manager closes the HTTP response even if read()/decode()
    # raises — the original never closed it.
    with urlopen(req) as res:
        return res.read().decode('utf-8')
09 |
def parsePage(s):
    """Yield one dict per movie found in page source *s* (HTML text).

    Matching is done by the module-level compiled regex ``com``; each
    yielded dict has the keys "id", "title", "rating_num" and
    "comment_num" taken from the pattern's named groups.
    """
    # Original rebound the single name `ret` to both the match iterator
    # and each result dict; distinct names avoid that shadowing.
    for match in com.finditer(s):
        yield {
            "id": match.group("id"),
            "title": match.group("title"),
            "rating_num": match.group("rating_num"),
            "comment_num": match.group("comment_num"),
        }
20 |
def main(num):
    """Fetch one Top-250 page starting at offset *num* (0, 25, 50, …)
    and append each parsed movie record, one per line, to the file
    ``move_info7``.
    """
    url = 'https://movie.douban.com/top250?start=%s&filter=' % num
    response_html = getPage(url)        # raw HTML source of the page (str)
    records = parsePage(response_html)  # lazy generator of movie dicts
    print(records)
    # `with` guarantees the file is closed even if fetching the next
    # record or writing raises — the original leaked the handle on error.
    with open("move_info7", "a", encoding="utf8") as f:
        for obj in records:
            print(obj)
            f.write(str(obj) + "\n")
32 |
# Compiled once at module level so parsePage() can reference it as a
# global. re.S lets `.` span newlines in the multi-line item markup.
com = re.compile(
    '<div class="item">.*?<div class="pic">.*?<em .*?>(?P<id>\d+).*?<span class="title">(?P<title>.*?)</span>'
    '.*?<span class="rating_num" .*?>(?P<rating_num>.*?)</span>.*?<span>(?P<comment_num>.*?)评价</span>', re.S)

if __name__ == "__main__":
    # Top 250 is paginated 25 entries per page: start = 0, 25, …, 225.
    # (Original advanced a counter with the garbled `count + = 25`.)
    for start in range(0, 250, 25):
        main(start)
常用伪装如下:
安装 fake-useragent
pip install fake-useragent
pip install -U fake-useragent
测试如下
from fake_useragent import UserAgent

# Demo: fake-useragent produces realistic User-Agent strings so a
# scraper can masquerade as a real browser.
ua = UserAgent()

print(ua.ie)       # Internet Explorer user agent
print(ua.opera)    # Opera browser
print(ua.chrome)   # Chrome browser
print(ua.firefox)  # Firefox browser
print(ua.safari)   # Safari browser

# `random` picks a different agent each access — handy for rotating
# the headers between requests.
print(ua.random)
print(ua.random)
豆瓣有反爬机制,需要实际测试
0则评论给“豆瓣电影爬虫脚本”