0x01代码
import os
import re
import time
from multiprocessing import Process

import requests
from bs4 import BeautifulSoup
# Request headers that imitate a desktop Firefox browser.
# NOTE(review): none of the functions visible here pass this dict to
# requests.get(). The user-agent value also contains an ellipsis character
# that looks like a truncated copy of the real string -- confirm and restore
# the full value before sending it as an HTTP header.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 6.1; W…) Gecko/20100101 Firefox/61.0",
}
def get_url(url):
    """Fetch a listing page and return the lazy-loaded image URLs found in it.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list with the ``src`` value of every ``<img class="lazy" ...>`` tag
        matched in the page text. The list is empty when the request fails
        or the server does not answer with HTTP 200, so callers can always
        iterate over the result.
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        res = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print("failed to fetch {}: {}".format(url, exc))
        return []
    if res.status_code != 200:
        return []
    return re.findall('<img class="lazy" src="(.*?)"', res.text)
def get_img_url(url):
    """Convert a thumbnail URL into the URL of the original image.

    Thumbnail addresses have the form ``//host/path/name!360x640.jpeg``: the
    part after ``!`` is a size tag followed by the file extension. The size
    tag is dropped and the extension is appended to the part before ``!``.

    Args:
        url: Thumbnail address, either protocol-relative (``//host/...``) or
            already carrying an ``http://`` / ``https://`` scheme.

    Returns:
        The rewritten absolute URL, or an empty string when ``url`` does not
        contain exactly one ``!`` or the part after it has no ``.``.
    """
    parts = url.split("!")
    if len(parts) != 2:
        return ""
    base, size_suffix = parts
    if "." not in size_suffix:
        return ""
    extension = size_suffix.split(".")[-1]
    # Only prepend a scheme when the address does not already carry one;
    # otherwise the result would be the malformed "https:https://...".
    if not base.startswith(("http://", "https://")):
        base = "https:" + base
    return base + "." + extension
def get_img(url):
    """Download the original image behind a thumbnail URL into ``mingxing/``.

    The thumbnail address is first rewritten with ``get_img_url``. Addresses
    that cannot be rewritten, or that do not contain ``woyaogexing.com``, are
    skipped. The file is saved under the last path segment of the rewritten
    URL.

    Args:
        url: Thumbnail address as found in the listing page.

    Returns:
        None. The image is written to disk as a side effect; request errors
        and non-200 responses are skipped so one bad image does not stop the
        crawl.
    """
    url = get_img_url(url)
    print(url)
    if "woyaogexing.com" not in url:
        return
    try:
        # A timeout keeps one stalled download from hanging the whole crawl.
        r = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print("failed to download {}: {}".format(url, exc))
        return
    if r.status_code != 200:
        return
    # open() does not create missing directories, so make sure the target
    # folder exists before the first write.
    os.makedirs("mingxing", exist_ok=True)
    img_path = "mingxing/" + url.split('/')[-1]
    with open(img_path, 'wb') as f:
        f.write(r.content)
def main():
    """Crawl listing pages 2-99 and download every image found on them.

    Pages are processed one after another. After each page, the number of
    seconds spent downloading that page's images is printed.
    """
    for page in range(2, 100):
        url = "https://www.woyaogexing.com/tupian/dongman/index_" + str(page) + ".html"
        img_urls = get_url(url)
        start = time.time()
        # A page that could not be fetched may yield nothing; skip it instead
        # of crashing on a non-iterable result.
        for img_src in img_urls or []:
            get_img(img_src)
        print(time.time() - start)


if __name__ == '__main__':
    main()
0x02待优化
目前图片为单线程顺序下载,速度较慢;后续计划优化为多线程的爬取方式。
0则评论给“python-单线程图片爬取”