豆瓣电影爬虫脚本

import re
from urllib.request import urlopen, Request

def getPage(url, timeout=10):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    A desktop Chrome ``User-Agent`` header is sent with the request
    instead of urllib's default one.

    Args:
        url: Address to request.
        timeout: Seconds ``urlopen`` may block on the connection before
            giving up. Defaults to 10 so a stalled server cannot hang the
            script indefinitely.

    Returns:
        The full response body as ``str``.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for
            HTTP error statuses).
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    request = Request(url, headers=headers)
    # The context manager closes the response even if read()/decode() raises.
    with urlopen(request, timeout=timeout) as response:
        return response.read().decode('utf-8')

def parsePage(s):   # s: page source (HTML text)
    """Lazily yield one dict per match of the module-level pattern ``com`` in ``s``.

    Each yielded dict maps the keys ``"id"``, ``"title"``, ``"rating_num"``
    and ``"comment_num"`` to the text captured by the named group of the
    same name. Nothing is matched until the generator is first advanced.
    """
    fields = ("id", "title", "rating_num", "comment_num")
    for match in com.finditer(s):
        yield {name: match.group(name) for name in fields}

def main(num):
    """Scrape one Douban Top 250 listing page and append its records to a file.

    The page is fetched with ``getPage`` and parsed with ``parsePage``. Each
    parsed record is printed and appended, as ``str(dict)`` followed by a
    newline, to the file ``move_info7`` in the current working directory.

    Args:
        num: Value substituted into the ``start`` query parameter of the
            listing URL.
    """
    url = 'https://movie.douban.com/top250?start=%s&filter=' % num
    response_html = getPage(url)          # page source as str
    records = parsePage(response_html)    # generator of dicts
    # Prints the generator object's repr, not its contents.
    print(records)
    # `with` guarantees the file is flushed and closed even if a print or
    # write fails part-way through the page.
    with open("move_info7", "a", encoding="utf8") as f:
        for obj in records:
            print(obj)
            f.write(str(obj) + "\n")

# Pattern for one movie entry on the listing page. Named groups: id, title,
# rating_num, comment_num. Raw strings keep "\d" a regex escape instead of an
# invalid Python string escape; re.S lets "." span the newlines between tags.
com = re.compile(
    r'<div class="item">.*?<div class="pic">.*?<em .*?>(?P<id>\d+).*?<span class="title">(?P<title>.*?)</span>'
    r'.*?<span class="rating_num" .*?>(?P<rating_num>.*?)</span>.*?<span>(?P<comment_num>.*?)评价</span>',
    re.S,
)
# Request ten listing pages; the `start` offset passed to main() advances by
# 25 each time: 0, 25, ..., 225.
for start in range(0, 250, 25):
    main(start)

常用的 User-Agent 伪装方式如下(使用 fake-useragent 库生成请求头):

安装 fake-useragent

pip install fake-useragent

pip install -U fake-useragent


测试如下

# Demo of the fake-useragent package: print a User-Agent string for several
# browser families, then two random ones.
from fake_useragent import UserAgent
ua = UserAgent()
# User-Agent string for Internet Explorer
print(ua.ie)
 
# Opera
print(ua.opera)
 
# Chrome
print(ua.chrome)
 
# Firefox
print(ua.firefox)
 
# Safari
print(ua.safari)
 
 
# Random User-Agent for varying request headers; printed twice, presumably
# to show that the value can differ between accesses.
print(ua.random)
print(ua.random)


豆瓣有反爬机制,上述伪装是否有效需要实际测试验证。





原文链接: 豆瓣电影爬虫脚本 版权所有,转载时请注明出处,违者必究。
注明出处格式:流沙 ( https://gyarmy.com/post-631.html )

发表评论

0则评论给“豆瓣电影爬虫脚本”