【爬虫】利用python批量爬取人民教育出版社电子教材
爬取完成:示例代码:
# coding:utf-8
'''
批量下载人民教育出版社电子教材(根据个人电脑环境参数设定)
'''
import os
import re
from urllib.parse import urljoin

import requests
# -- User settings: adjust for your own machine ---------------------------------
pageBegin = 758857  # id of the first page to fetch
pageEnd = 758828  # id of the last page to fetch
# Output folder on the desktop; create the "path" folder before running.
path = "C:\\Users\\hkcmd\\Desktop\\path\\"
# Root URL under which the pages to crawl live.
rootA = "http://old.pep.com.cn/gzsx/jszx_1/czsxtbjxzy/qrzptgjzxjc/dzkb/decxza/201008"
# -- End of user settings -------------------------------------------------------

# Bare site root, without any path component.
rootB = "http://old.pep.com.cn"

# Number of pages covered by the two ids above, both ends included.
num = (pageBegin - pageEnd) + 1

# pattern1: an upper-case "<IMG" through the last ">" on the same line.
# pattern2: from the first "/" through the last "jpg" on the same line.
pattern1 = re.compile("<IMG.*>", flags=re.MULTILINE)
pattern2 = re.compile("/.*jpg", flags=re.MULTILINE)

# Browser-like User-Agent header sent with every request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/67.0.3396.87 Safari/537.36"
    ),
}
# Seconds to wait for the server before giving up on a single request.
_REQUEST_TIMEOUT = 30

# Pulls the value of a src attribute out of a tag.  The leading \b keeps it
# from matching look-alike attributes such as OLDSRC.
_SRC_ATTR = re.compile(r"""\bsrc\s*=\s*["']?([^"'\s>]+)""", re.I)


def _extract_image_url(html, page_url):
    """Return the absolute URL of the first <IMG> image in *html*, or None.

    The tag is located with ``pattern1`` (first upper-case ``<IMG`` in the
    page).  Its ``src`` value is then resolved against *page_url*, so a
    relative ``./x.jpg`` lands next to the page and an absolute ``/a/x.jpg``
    lands under the site root.  None is returned when the page has no such
    tag or the tag carries no ``src`` attribute.
    """
    tag = pattern1.search(html)
    if tag is None:
        return None
    src = _SRC_ATTR.search(tag.group())
    if src is None:
        return None
    return urljoin(page_url, src.group(1))


def _fetch(url):
    """GET *url* with the shared headers; return the response or None.

    Network errors and non-200 answers are reported on stdout and turned
    into None so that one bad page does not abort the whole batch.
    """
    try:
        response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print("request failed:", url, exc)
        return None
    if response.status_code != 200:
        print(response.status_code)
        return None
    return response


# Make sure the output folder is there before any download is attempted.
os.makedirs(path, exist_ok=True)

for i in range(num):
    # Page ids are walked downwards starting from pageBegin.
    page_url = rootA + "/t20100826_" + str(pageBegin - i) + ".htm"
    page = _fetch(page_url)
    if page is None:
        continue

    img_url = _extract_image_url(page.text, page_url)
    if img_url is None:
        print("no image found:", page_url)
        continue

    img = _fetch(img_url)
    if img is None:
        continue

    # The context manager closes the file even if the write fails.
    with open(os.path.join(path, 'img' + str(i) + '.jpg'), 'wb') as out:
        out.write(img.content)

print("done")
页:
[1]